aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib/memset.S
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2005-11-05 11:25:54 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2005-11-14 22:55:17 -0500
commita5b250a428aabc619ace872f8220a7d0b8f7d557 (patch)
tree11cabf07982ae37f94bc929f9a605cbbd20e35ab /arch/x86_64/lib/memset.S
parenta6f5deb2be4c82f24fefadcbf7e448f540c05ae6 (diff)
[PATCH] x86_64: Remove optimization for B stepping AMD K8
B stepping were the first shipping Opterons. memcpy/memset/copy_page/ clear_page had special optimized version for them. These are really old and in the minority now and the difference to the generic versions (using rep microcode) is not that big anyways. So just remove them. TODO: figure out optimized versions for Intel Netburst based EM64T Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib/memset.S')
-rw-r--r--arch/x86_64/lib/memset.S94
1 files changed, 0 insertions, 94 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 4b4c40638640..2aa48f24ed1e 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,98 +13,6 @@
13 .p2align 4 13 .p2align 4
14memset: 14memset:
15__memset: 15__memset:
16 movq %rdi,%r10
17 movq %rdx,%r11
18
19 /* expand byte value */
20 movzbl %sil,%ecx
21 movabs $0x0101010101010101,%rax
22 mul %rcx /* with rax, clobbers rdx */
23
24 /* align dst */
25 movl %edi,%r9d
26 andl $7,%r9d
27 jnz .Lbad_alignment
28.Lafter_bad_alignment:
29
30 movl %r11d,%ecx
31 shrl $6,%ecx
32 jz .Lhandle_tail
33
34 .p2align 4
35.Lloop_64:
36 decl %ecx
37 movq %rax,(%rdi)
38 movq %rax,8(%rdi)
39 movq %rax,16(%rdi)
40 movq %rax,24(%rdi)
41 movq %rax,32(%rdi)
42 movq %rax,40(%rdi)
43 movq %rax,48(%rdi)
44 movq %rax,56(%rdi)
45 leaq 64(%rdi),%rdi
46 jnz .Lloop_64
47
48 /* Handle tail in loops. The loops should be faster than hard
49 to predict jump tables. */
50 .p2align 4
51.Lhandle_tail:
52 movl %r11d,%ecx
53 andl $63&(~7),%ecx
54 jz .Lhandle_7
55 shrl $3,%ecx
56 .p2align 4
57.Lloop_8:
58 decl %ecx
59 movq %rax,(%rdi)
60 leaq 8(%rdi),%rdi
61 jnz .Lloop_8
62
63.Lhandle_7:
64 movl %r11d,%ecx
65 andl $7,%ecx
66 jz .Lende
67 .p2align 4
68.Lloop_1:
69 decl %ecx
70 movb %al,(%rdi)
71 leaq 1(%rdi),%rdi
72 jnz .Lloop_1
73
74.Lende:
75 movq %r10,%rax
76 ret
77
78.Lbad_alignment:
79 cmpq $7,%r11
80 jbe .Lhandle_7
81 movq %rax,(%rdi) /* unaligned store */
82 movq $8,%r8
83 subq %r9,%r8
84 addq %r8,%rdi
85 subq %r8,%r11
86 jmp .Lafter_bad_alignment
87
88 /* C stepping K8 run faster using the string instructions.
89 It is also a lot simpler. Use this when possible */
90
91#include <asm/cpufeature.h>
92
93 .section .altinstructions,"a"
94 .align 8
95 .quad memset
96 .quad memset_c
97 .byte X86_FEATURE_K8_C
98 .byte memset_c_end-memset_c
99 .byte memset_c_end-memset_c
100 .previous
101
102 .section .altinstr_replacement,"ax"
103 /* rdi destination
104 * rsi value
105 * rdx count
106 */
107memset_c:
108 movq %rdi,%r9 16 movq %rdi,%r9
109 movl %edx,%r8d 17 movl %edx,%r8d
110 andl $7,%r8d 18 andl $7,%r8d
@@ -121,5 +29,3 @@ memset_c:
121 stosb 29 stosb
122 movq %r9,%rax 30 movq %r9,%rax
123 ret 31 ret
124memset_c_end:
125 .previous