aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86_64/lib
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2006-02-03 15:51:02 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-02-04 19:43:13 -0500
commit7bcd3f34e262bbebffa954d80eab3a84f053da31 (patch)
treef0765da9eaa8024a2b1d67d3e43730cb32f99fa7 /arch/x86_64/lib
parent6bca52b544489b626c7d0db801df6b4aa3d5adb5 (diff)
[PATCH] x86_64: Undo the earlier changes to remove unrolled copy/memset functions
They cause quite bad performance regressions on Netburst This is temporary until we can get new optimized functions for these CPUs. This undoes changes that were done in 2.6.15 and in 2.6.16-rc1, essentially bringing the code back to 2.6.14 level. Only change is I renamed the X86_FEATURE_K8_C flag to X86_FEATURE_REP_GOOD and fixed the check for the flag and also fixed some comments. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r--arch/x86_64/lib/clear_page.S38
-rw-r--r--arch/x86_64/lib/copy_page.S87
-rw-r--r--arch/x86_64/lib/copy_user.S247
-rw-r--r--arch/x86_64/lib/memcpy.S93
-rw-r--r--arch/x86_64/lib/memset.S94
5 files changed, 536 insertions, 23 deletions
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
index 43d9fa136180..1f81b79b796c 100644
--- a/arch/x86_64/lib/clear_page.S
+++ b/arch/x86_64/lib/clear_page.S
@@ -5,8 +5,46 @@
5 .globl clear_page 5 .globl clear_page
6 .p2align 4 6 .p2align 4
7clear_page: 7clear_page:
8 xorl %eax,%eax
9 movl $4096/64,%ecx
10 .p2align 4
11.Lloop:
12 decl %ecx
13#define PUT(x) movq %rax,x*8(%rdi)
14 movq %rax,(%rdi)
15 PUT(1)
16 PUT(2)
17 PUT(3)
18 PUT(4)
19 PUT(5)
20 PUT(6)
21 PUT(7)
22 leaq 64(%rdi),%rdi
23 jnz .Lloop
24 nop
25 ret
26clear_page_end:
27
28 /* Some CPUs run faster using the string instructions.
29 It is also a lot simpler. Use this when possible */
30
31#include <asm/cpufeature.h>
32
33 .section .altinstructions,"a"
34 .align 8
35 .quad clear_page
36 .quad clear_page_c
37 .byte X86_FEATURE_REP_GOOD
38 .byte clear_page_end-clear_page
39 .byte clear_page_c_end-clear_page_c
40 .previous
41
42 .section .altinstr_replacement,"ax"
43clear_page_c:
8 movl $4096/8,%ecx 44 movl $4096/8,%ecx
9 xorl %eax,%eax 45 xorl %eax,%eax
10 rep 46 rep
11 stosq 47 stosq
12 ret 48 ret
49clear_page_c_end:
50 .previous
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
index 621a19769406..8fa19d96a7ee 100644
--- a/arch/x86_64/lib/copy_page.S
+++ b/arch/x86_64/lib/copy_page.S
@@ -8,7 +8,94 @@
8 .globl copy_page 8 .globl copy_page
9 .p2align 4 9 .p2align 4
10copy_page: 10copy_page:
11 subq $3*8,%rsp
12 movq %rbx,(%rsp)
13 movq %r12,1*8(%rsp)
14 movq %r13,2*8(%rsp)
15
16 movl $(4096/64)-5,%ecx
17 .p2align 4
18.Loop64:
19 dec %rcx
20
21 movq (%rsi), %rax
22 movq 8 (%rsi), %rbx
23 movq 16 (%rsi), %rdx
24 movq 24 (%rsi), %r8
25 movq 32 (%rsi), %r9
26 movq 40 (%rsi), %r10
27 movq 48 (%rsi), %r11
28 movq 56 (%rsi), %r12
29
30 prefetcht0 5*64(%rsi)
31
32 movq %rax, (%rdi)
33 movq %rbx, 8 (%rdi)
34 movq %rdx, 16 (%rdi)
35 movq %r8, 24 (%rdi)
36 movq %r9, 32 (%rdi)
37 movq %r10, 40 (%rdi)
38 movq %r11, 48 (%rdi)
39 movq %r12, 56 (%rdi)
40
41 leaq 64 (%rsi), %rsi
42 leaq 64 (%rdi), %rdi
43
44 jnz .Loop64
45
46 movl $5,%ecx
47 .p2align 4
48.Loop2:
49 decl %ecx
50
51 movq (%rsi), %rax
52 movq 8 (%rsi), %rbx
53 movq 16 (%rsi), %rdx
54 movq 24 (%rsi), %r8
55 movq 32 (%rsi), %r9
56 movq 40 (%rsi), %r10
57 movq 48 (%rsi), %r11
58 movq 56 (%rsi), %r12
59
60 movq %rax, (%rdi)
61 movq %rbx, 8 (%rdi)
62 movq %rdx, 16 (%rdi)
63 movq %r8, 24 (%rdi)
64 movq %r9, 32 (%rdi)
65 movq %r10, 40 (%rdi)
66 movq %r11, 48 (%rdi)
67 movq %r12, 56 (%rdi)
68
69 leaq 64(%rdi),%rdi
70 leaq 64(%rsi),%rsi
71
72 jnz .Loop2
73
74 movq (%rsp),%rbx
75 movq 1*8(%rsp),%r12
76 movq 2*8(%rsp),%r13
77 addq $3*8,%rsp
78 ret
79
80 /* Some CPUs run faster using the string copy instructions.
81 It is also a lot simpler. Use this when possible */
82
83#include <asm/cpufeature.h>
84
85 .section .altinstructions,"a"
86 .align 8
87 .quad copy_page
88 .quad copy_page_c
89 .byte X86_FEATURE_REP_GOOD
90 .byte copy_page_c_end-copy_page_c
91 .byte copy_page_c_end-copy_page_c
92 .previous
93
94 .section .altinstr_replacement,"ax"
95copy_page_c:
11 movl $4096/8,%ecx 96 movl $4096/8,%ecx
12 rep 97 rep
13 movsq 98 movsq
14 ret 99 ret
100copy_page_c_end:
101 .previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index 79422b6559c3..f64569b83b54 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,9 +4,12 @@
4 * Functions to copy from and to user space. 4 * Functions to copy from and to user space.
5 */ 5 */
6 6
7#define FIX_ALIGNMENT 1
8
7 #include <asm/current.h> 9 #include <asm/current.h>
8 #include <asm/asm-offsets.h> 10 #include <asm/asm-offsets.h>
9 #include <asm/thread_info.h> 11 #include <asm/thread_info.h>
12 #include <asm/cpufeature.h>
10 13
11/* Standard copy_to_user with segment limit checking */ 14/* Standard copy_to_user with segment limit checking */
12 .globl copy_to_user 15 .globl copy_to_user
@@ -18,7 +21,23 @@ copy_to_user:
18 jc bad_to_user 21 jc bad_to_user
19 cmpq threadinfo_addr_limit(%rax),%rcx 22 cmpq threadinfo_addr_limit(%rax),%rcx
20 jae bad_to_user 23 jae bad_to_user
21 jmp copy_user_generic 242:
25 .byte 0xe9 /* 32bit jump */
26 .long .Lcug-1f
271:
28
29 .section .altinstr_replacement,"ax"
303: .byte 0xe9 /* replacement jmp with 8 bit immediate */
31 .long copy_user_generic_c-1b /* offset */
32 .previous
33 .section .altinstructions,"a"
34 .align 8
35 .quad 2b
36 .quad 3b
37 .byte X86_FEATURE_REP_GOOD
38 .byte 5
39 .byte 5
40 .previous
22 41
23/* Standard copy_from_user with segment limit checking */ 42/* Standard copy_from_user with segment limit checking */
24 .globl copy_from_user 43 .globl copy_from_user
@@ -53,44 +72,230 @@ bad_to_user:
53 * rsi source 72 * rsi source
54 * rdx count 73 * rdx count
55 * 74 *
56 * Only 4GB of copy is supported. This shouldn't be a problem
57 * because the kernel normally only writes from/to page sized chunks
58 * even if user space passed a longer buffer.
59 * And more would be dangerous because both Intel and AMD have
60 * errata with rep movsq > 4GB. If someone feels the need to fix
61 * this please consider this.
62 *
63 * Output: 75 * Output:
64 * eax uncopied bytes or 0 if successful. 76 * eax uncopied bytes or 0 if successful.
65 */ 77 */
66
67 .globl copy_user_generic 78 .globl copy_user_generic
79 .p2align 4
68copy_user_generic: 80copy_user_generic:
81 .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
82 .byte 0x66,0x90
831:
84 .section .altinstr_replacement,"ax"
852: .byte 0xe9 /* near jump with 32bit immediate */
86 .long copy_user_generic_c-1b /* offset */
87 .previous
88 .section .altinstructions,"a"
89 .align 8
90 .quad copy_user_generic
91 .quad 2b
92 .byte X86_FEATURE_REP_GOOD
93 .byte 5
94 .byte 5
95 .previous
96.Lcug:
97 pushq %rbx
98 xorl %eax,%eax /*zero for the exception handler */
99
100#ifdef FIX_ALIGNMENT
101 /* check for bad alignment of destination */
102 movl %edi,%ecx
103 andl $7,%ecx
104 jnz .Lbad_alignment
105.Lafter_bad_alignment:
106#endif
107
108 movq %rdx,%rcx
109
110 movl $64,%ebx
111 shrq $6,%rdx
112 decq %rdx
113 js .Lhandle_tail
114
115 .p2align 4
116.Lloop:
117.Ls1: movq (%rsi),%r11
118.Ls2: movq 1*8(%rsi),%r8
119.Ls3: movq 2*8(%rsi),%r9
120.Ls4: movq 3*8(%rsi),%r10
121.Ld1: movq %r11,(%rdi)
122.Ld2: movq %r8,1*8(%rdi)
123.Ld3: movq %r9,2*8(%rdi)
124.Ld4: movq %r10,3*8(%rdi)
125
126.Ls5: movq 4*8(%rsi),%r11
127.Ls6: movq 5*8(%rsi),%r8
128.Ls7: movq 6*8(%rsi),%r9
129.Ls8: movq 7*8(%rsi),%r10
130.Ld5: movq %r11,4*8(%rdi)
131.Ld6: movq %r8,5*8(%rdi)
132.Ld7: movq %r9,6*8(%rdi)
133.Ld8: movq %r10,7*8(%rdi)
134
135 decq %rdx
136
137 leaq 64(%rsi),%rsi
138 leaq 64(%rdi),%rdi
139
140 jns .Lloop
141
142 .p2align 4
143.Lhandle_tail:
144 movl %ecx,%edx
145 andl $63,%ecx
146 shrl $3,%ecx
147 jz .Lhandle_7
148 movl $8,%ebx
149 .p2align 4
150.Lloop_8:
151.Ls9: movq (%rsi),%r8
152.Ld9: movq %r8,(%rdi)
153 decl %ecx
154 leaq 8(%rdi),%rdi
155 leaq 8(%rsi),%rsi
156 jnz .Lloop_8
157
158.Lhandle_7:
159 movl %edx,%ecx
160 andl $7,%ecx
161 jz .Lende
162 .p2align 4
163.Lloop_1:
164.Ls10: movb (%rsi),%bl
165.Ld10: movb %bl,(%rdi)
166 incq %rdi
167 incq %rsi
168 decl %ecx
169 jnz .Lloop_1
170
171.Lende:
172 popq %rbx
173 ret
174
175#ifdef FIX_ALIGNMENT
176 /* align destination */
177 .p2align 4
178.Lbad_alignment:
179 movl $8,%r9d
180 subl %ecx,%r9d
181 movl %r9d,%ecx
182 cmpq %r9,%rdx
183 jz .Lhandle_7
184 js .Lhandle_7
185.Lalign_1:
186.Ls11: movb (%rsi),%bl
187.Ld11: movb %bl,(%rdi)
188 incq %rsi
189 incq %rdi
190 decl %ecx
191 jnz .Lalign_1
192 subq %r9,%rdx
193 jmp .Lafter_bad_alignment
194#endif
195
196 /* table sorted by exception address */
197 .section __ex_table,"a"
198 .align 8
199 .quad .Ls1,.Ls1e
200 .quad .Ls2,.Ls2e
201 .quad .Ls3,.Ls3e
202 .quad .Ls4,.Ls4e
203 .quad .Ld1,.Ls1e
204 .quad .Ld2,.Ls2e
205 .quad .Ld3,.Ls3e
206 .quad .Ld4,.Ls4e
207 .quad .Ls5,.Ls5e
208 .quad .Ls6,.Ls6e
209 .quad .Ls7,.Ls7e
210 .quad .Ls8,.Ls8e
211 .quad .Ld5,.Ls5e
212 .quad .Ld6,.Ls6e
213 .quad .Ld7,.Ls7e
214 .quad .Ld8,.Ls8e
215 .quad .Ls9,.Le_quad
216 .quad .Ld9,.Le_quad
217 .quad .Ls10,.Le_byte
218 .quad .Ld10,.Le_byte
219#ifdef FIX_ALIGNMENT
220 .quad .Ls11,.Lzero_rest
221 .quad .Ld11,.Lzero_rest
222#endif
223 .quad .Le5,.Le_zero
224 .previous
225
226 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
227 pessimistic side. this is gross. it would be better to fix the
228 interface. */
229 /* eax: zero, ebx: 64 */
230.Ls1e: addl $8,%eax
231.Ls2e: addl $8,%eax
232.Ls3e: addl $8,%eax
233.Ls4e: addl $8,%eax
234.Ls5e: addl $8,%eax
235.Ls6e: addl $8,%eax
236.Ls7e: addl $8,%eax
237.Ls8e: addl $8,%eax
238 addq %rbx,%rdi /* +64 */
239 subq %rax,%rdi /* correct destination with computed offset */
240
241 shlq $6,%rdx /* loop counter * 64 (stride length) */
242 addq %rax,%rdx /* add offset to loopcnt */
243 andl $63,%ecx /* remaining bytes */
244 addq %rcx,%rdx /* add them */
245 jmp .Lzero_rest
246
247 /* exception on quad word loop in tail handling */
248 /* ecx: loopcnt/8, %edx: length, rdi: correct */
249.Le_quad:
250 shll $3,%ecx
251 andl $7,%edx
252 addl %ecx,%edx
253 /* edx: bytes to zero, rdi: dest, eax:zero */
254.Lzero_rest:
255 movq %rdx,%rcx
256.Le_byte:
257 xorl %eax,%eax
258.Le5: rep
259 stosb
260 /* when there is another exception while zeroing the rest just return */
261.Le_zero:
262 movq %rdx,%rax
263 jmp .Lende
264
265 /* Some CPUs run faster using the string copy instructions.
266 This is also a lot simpler. Use them when possible.
267 Patch in jmps to this code instead of copying it fully
268 to avoid unwanted aliasing in the exception tables. */
269
270 /* rdi destination
271 * rsi source
272 * rdx count
273 *
274 * Output:
275 * eax uncopied bytes or 0 if successfull.
276 *
277 * Only 4GB of copy is supported. This shouldn't be a problem
278 * because the kernel normally only writes from/to page sized chunks
279 * even if user space passed a longer buffer.
280 * And more would be dangerous because both Intel and AMD have
281 * errata with rep movsq > 4GB. If someone feels the need to fix
282 * this please consider this.
283 */
284copy_user_generic_c:
69 movl %edx,%ecx 285 movl %edx,%ecx
70 shrl $3,%ecx 286 shrl $3,%ecx
71 andl $7,%edx 287 andl $7,%edx
72 jz 5f
731: rep 2881: rep
74 movsq 289 movsq
75 movl %edx,%ecx 290 movl %edx,%ecx
76 xor %eax,%eax
772: rep 2912: rep
78 movsb 292 movsb
2934: movl %ecx,%eax
79 ret 294 ret
80 /* align here? */
815: xorl %eax,%eax
826: rep movsq
83 ret
84
85 .section .fixup,"ax"
863: lea (%rdx,%rcx,8),%rax 2953: lea (%rdx,%rcx,8),%rax
87 ret 296 ret
884: movl %ecx,%eax
89 ret
90 .previous
91 297
92 .section __ex_table,"a" 298 .section __ex_table,"a"
93 .quad 1b,3b 299 .quad 1b,3b
94 .quad 2b,4b 300 .quad 2b,4b
95 .quad 6b,4b
96 .previous 301 .previous
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
index 92dd80544602..5554948b5554 100644
--- a/arch/x86_64/lib/memcpy.S
+++ b/arch/x86_64/lib/memcpy.S
@@ -11,8 +11,6 @@
11 * 11 *
12 * Output: 12 * Output:
13 * rax original destination 13 * rax original destination
14 *
15 * TODO: check best memcpy for PSC
16 */ 14 */
17 15
18 .globl __memcpy 16 .globl __memcpy
@@ -20,6 +18,95 @@
20 .p2align 4 18 .p2align 4
21__memcpy: 19__memcpy:
22memcpy: 20memcpy:
21 pushq %rbx
22 movq %rdi,%rax
23
24 movl %edx,%ecx
25 shrl $6,%ecx
26 jz .Lhandle_tail
27
28 .p2align 4
29.Lloop_64:
30 decl %ecx
31
32 movq (%rsi),%r11
33 movq 8(%rsi),%r8
34
35 movq %r11,(%rdi)
36 movq %r8,1*8(%rdi)
37
38 movq 2*8(%rsi),%r9
39 movq 3*8(%rsi),%r10
40
41 movq %r9,2*8(%rdi)
42 movq %r10,3*8(%rdi)
43
44 movq 4*8(%rsi),%r11
45 movq 5*8(%rsi),%r8
46
47 movq %r11,4*8(%rdi)
48 movq %r8,5*8(%rdi)
49
50 movq 6*8(%rsi),%r9
51 movq 7*8(%rsi),%r10
52
53 movq %r9,6*8(%rdi)
54 movq %r10,7*8(%rdi)
55
56 leaq 64(%rsi),%rsi
57 leaq 64(%rdi),%rdi
58 jnz .Lloop_64
59
60.Lhandle_tail:
61 movl %edx,%ecx
62 andl $63,%ecx
63 shrl $3,%ecx
64 jz .Lhandle_7
65 .p2align 4
66.Lloop_8:
67 decl %ecx
68 movq (%rsi),%r8
69 movq %r8,(%rdi)
70 leaq 8(%rdi),%rdi
71 leaq 8(%rsi),%rsi
72 jnz .Lloop_8
73
74.Lhandle_7:
75 movl %edx,%ecx
76 andl $7,%ecx
77 jz .Lende
78 .p2align 4
79.Lloop_1:
80 movb (%rsi),%r8b
81 movb %r8b,(%rdi)
82 incq %rdi
83 incq %rsi
84 decl %ecx
85 jnz .Lloop_1
86
87.Lende:
88 popq %rbx
89 ret
90.Lfinal:
91
92 /* Some CPUs run faster using the string copy instructions.
93 It is also a lot simpler. Use this when possible */
94
95 .section .altinstructions,"a"
96 .align 8
97 .quad memcpy
98 .quad memcpy_c
99 .byte X86_FEATURE_REP_GOOD
100 .byte .Lfinal-memcpy
101 .byte memcpy_c_end-memcpy_c
102 .previous
103
104 .section .altinstr_replacement,"ax"
105 /* rdi destination
106 * rsi source
107 * rdx count
108 */
109memcpy_c:
23 movq %rdi,%rax 110 movq %rdi,%rax
24 movl %edx,%ecx 111 movl %edx,%ecx
25 shrl $3,%ecx 112 shrl $3,%ecx
@@ -30,3 +117,5 @@ memcpy:
30 rep 117 rep
31 movsb 118 movsb
32 ret 119 ret
120memcpy_c_end:
121 .previous
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed1e..ad397f2c7de8 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,6 +13,98 @@
13 .p2align 4 13 .p2align 4
14memset: 14memset:
15__memset: 15__memset:
16 movq %rdi,%r10
17 movq %rdx,%r11
18
19 /* expand byte value */
20 movzbl %sil,%ecx
21 movabs $0x0101010101010101,%rax
22 mul %rcx /* with rax, clobbers rdx */
23
24 /* align dst */
25 movl %edi,%r9d
26 andl $7,%r9d
27 jnz .Lbad_alignment
28.Lafter_bad_alignment:
29
30 movl %r11d,%ecx
31 shrl $6,%ecx
32 jz .Lhandle_tail
33
34 .p2align 4
35.Lloop_64:
36 decl %ecx
37 movq %rax,(%rdi)
38 movq %rax,8(%rdi)
39 movq %rax,16(%rdi)
40 movq %rax,24(%rdi)
41 movq %rax,32(%rdi)
42 movq %rax,40(%rdi)
43 movq %rax,48(%rdi)
44 movq %rax,56(%rdi)
45 leaq 64(%rdi),%rdi
46 jnz .Lloop_64
47
48 /* Handle tail in loops. The loops should be faster than hard
49 to predict jump tables. */
50 .p2align 4
51.Lhandle_tail:
52 movl %r11d,%ecx
53 andl $63&(~7),%ecx
54 jz .Lhandle_7
55 shrl $3,%ecx
56 .p2align 4
57.Lloop_8:
58 decl %ecx
59 movq %rax,(%rdi)
60 leaq 8(%rdi),%rdi
61 jnz .Lloop_8
62
63.Lhandle_7:
64 movl %r11d,%ecx
65 andl $7,%ecx
66 jz .Lende
67 .p2align 4
68.Lloop_1:
69 decl %ecx
70 movb %al,(%rdi)
71 leaq 1(%rdi),%rdi
72 jnz .Lloop_1
73
74.Lende:
75 movq %r10,%rax
76 ret
77
78.Lbad_alignment:
79 cmpq $7,%r11
80 jbe .Lhandle_7
81 movq %rax,(%rdi) /* unaligned store */
82 movq $8,%r8
83 subq %r9,%r8
84 addq %r8,%rdi
85 subq %r8,%r11
86 jmp .Lafter_bad_alignment
87
88 /* Some CPUs run faster using the string instructions.
89 It is also a lot simpler. Use this when possible */
90
91#include <asm/cpufeature.h>
92
93 .section .altinstructions,"a"
94 .align 8
95 .quad memset
96 .quad memset_c
97 .byte X86_FEATURE_REP_GOOD
98 .byte memset_c_end-memset_c
99 .byte memset_c_end-memset_c
100 .previous
101
102 .section .altinstr_replacement,"ax"
103 /* rdi destination
104 * rsi value
105 * rdx count
106 */
107memset_c:
16 movq %rdi,%r9 108 movq %rdi,%r9
17 movl %edx,%r8d 109 movl %edx,%r8d
18 andl $7,%r8d 110 andl $7,%r8d
@@ -29,3 +121,5 @@ __memset:
29 stosb 121 stosb
30 movq %r9,%rax 122 movq %r9,%rax
31 ret 123 ret
124memset_c_end:
125 .previous