aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2006-01-11 16:44:45 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-11 22:04:55 -0500
commit2cbc9ee35c992f88fb2558a4f13920e3df4ef3bb (patch)
tree51bed27fe9050cae3e7de2d24f55874455316ab2 /arch
parent17a941d854a3f7b0bb916fdeee4c9ffdcc19d8d3 (diff)
[PATCH] x86_64: Clean up copy_*_user
- Remove optimization for old B stepping Opteron - Make the fast path for copies with a multiple of eight length faster. - Minor instruction rearrangement to hopefully avoid a pipeline stall or two. - Add comment about errata to consider. Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86_64/lib/copy_user.S244
1 files changed, 23 insertions, 221 deletions
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index dfa358b05c89..79422b6559c3 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,12 +4,9 @@
4 * Functions to copy from and to user space. 4 * Functions to copy from and to user space.
5 */ 5 */
6 6
7#define FIX_ALIGNMENT 1
8
9 #include <asm/current.h> 7 #include <asm/current.h>
10 #include <asm/asm-offsets.h> 8 #include <asm/asm-offsets.h>
11 #include <asm/thread_info.h> 9 #include <asm/thread_info.h>
12 #include <asm/cpufeature.h>
13 10
14/* Standard copy_to_user with segment limit checking */ 11/* Standard copy_to_user with segment limit checking */
15 .globl copy_to_user 12 .globl copy_to_user
@@ -21,23 +18,7 @@ copy_to_user:
21 jc bad_to_user 18 jc bad_to_user
22 cmpq threadinfo_addr_limit(%rax),%rcx 19 cmpq threadinfo_addr_limit(%rax),%rcx
23 jae bad_to_user 20 jae bad_to_user
242: 21 jmp copy_user_generic
25 .byte 0xe9 /* 32bit jump */
26 .long .Lcug-1f
271:
28
29 .section .altinstr_replacement,"ax"
303: .byte 0xe9 /* replacement jmp with 8 bit immediate */
31 .long copy_user_generic_c-1b /* offset */
32 .previous
33 .section .altinstructions,"a"
34 .align 8
35 .quad 2b
36 .quad 3b
37 .byte X86_FEATURE_K8_C
38 .byte 5
39 .byte 5
40 .previous
41 22
42/* Standard copy_from_user with segment limit checking */ 23/* Standard copy_from_user with segment limit checking */
43 .globl copy_from_user 24 .globl copy_from_user
@@ -72,223 +53,44 @@ bad_to_user:
72 * rsi source 53 * rsi source
73 * rdx count 54 * rdx count
74 * 55 *
56 * Only 4GB of copy is supported. This shouldn't be a problem
57 * because the kernel normally only writes from/to page sized chunks
58 * even if user space passed a longer buffer.
59 * And more would be dangerous because both Intel and AMD have
60 * errata with rep movsq > 4GB. If someone feels the need to fix
61 * this please consider this.
62 *
75 * Output: 63 * Output:
76 * eax uncopied bytes or 0 if successful. 64 * eax uncopied bytes or 0 if successful.
77 */ 65 */
78 .globl copy_user_generic
79 .p2align 4
80copy_user_generic:
81 .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
82 .byte 0x66,0x90
831:
84 .section .altinstr_replacement,"ax"
852: .byte 0xe9 /* near jump with 32bit immediate */
86 .long copy_user_generic_c-1b /* offset */
87 .previous
88 .section .altinstructions,"a"
89 .align 8
90 .quad copy_user_generic
91 .quad 2b
92 .byte X86_FEATURE_K8_C
93 .byte 5
94 .byte 5
95 .previous
96.Lcug:
97 pushq %rbx
98 xorl %eax,%eax /*zero for the exception handler */
99
100#ifdef FIX_ALIGNMENT
101 /* check for bad alignment of destination */
102 movl %edi,%ecx
103 andl $7,%ecx
104 jnz .Lbad_alignment
105.Lafter_bad_alignment:
106#endif
107 66
108 movq %rdx,%rcx 67 .globl copy_user_generic
109 68copy_user_generic:
110 movl $64,%ebx
111 shrq $6,%rdx
112 decq %rdx
113 js .Lhandle_tail
114
115 .p2align 4
116.Lloop:
117.Ls1: movq (%rsi),%r11
118.Ls2: movq 1*8(%rsi),%r8
119.Ls3: movq 2*8(%rsi),%r9
120.Ls4: movq 3*8(%rsi),%r10
121.Ld1: movq %r11,(%rdi)
122.Ld2: movq %r8,1*8(%rdi)
123.Ld3: movq %r9,2*8(%rdi)
124.Ld4: movq %r10,3*8(%rdi)
125
126.Ls5: movq 4*8(%rsi),%r11
127.Ls6: movq 5*8(%rsi),%r8
128.Ls7: movq 6*8(%rsi),%r9
129.Ls8: movq 7*8(%rsi),%r10
130.Ld5: movq %r11,4*8(%rdi)
131.Ld6: movq %r8,5*8(%rdi)
132.Ld7: movq %r9,6*8(%rdi)
133.Ld8: movq %r10,7*8(%rdi)
134
135 decq %rdx
136
137 leaq 64(%rsi),%rsi
138 leaq 64(%rdi),%rdi
139
140 jns .Lloop
141
142 .p2align 4
143.Lhandle_tail:
144 movl %ecx,%edx
145 andl $63,%ecx
146 shrl $3,%ecx
147 jz .Lhandle_7
148 movl $8,%ebx
149 .p2align 4
150.Lloop_8:
151.Ls9: movq (%rsi),%r8
152.Ld9: movq %r8,(%rdi)
153 decl %ecx
154 leaq 8(%rdi),%rdi
155 leaq 8(%rsi),%rsi
156 jnz .Lloop_8
157
158.Lhandle_7:
159 movl %edx,%ecx
160 andl $7,%ecx
161 jz .Lende
162 .p2align 4
163.Lloop_1:
164.Ls10: movb (%rsi),%bl
165.Ld10: movb %bl,(%rdi)
166 incq %rdi
167 incq %rsi
168 decl %ecx
169 jnz .Lloop_1
170
171.Lende:
172 popq %rbx
173 ret
174
175#ifdef FIX_ALIGNMENT
176 /* align destination */
177 .p2align 4
178.Lbad_alignment:
179 movl $8,%r9d
180 subl %ecx,%r9d
181 movl %r9d,%ecx
182 cmpq %r9,%rdx
183 jz .Lhandle_7
184 js .Lhandle_7
185.Lalign_1:
186.Ls11: movb (%rsi),%bl
187.Ld11: movb %bl,(%rdi)
188 incq %rsi
189 incq %rdi
190 decl %ecx
191 jnz .Lalign_1
192 subq %r9,%rdx
193 jmp .Lafter_bad_alignment
194#endif
195
196 /* table sorted by exception address */
197 .section __ex_table,"a"
198 .align 8
199 .quad .Ls1,.Ls1e
200 .quad .Ls2,.Ls2e
201 .quad .Ls3,.Ls3e
202 .quad .Ls4,.Ls4e
203 .quad .Ld1,.Ls1e
204 .quad .Ld2,.Ls2e
205 .quad .Ld3,.Ls3e
206 .quad .Ld4,.Ls4e
207 .quad .Ls5,.Ls5e
208 .quad .Ls6,.Ls6e
209 .quad .Ls7,.Ls7e
210 .quad .Ls8,.Ls8e
211 .quad .Ld5,.Ls5e
212 .quad .Ld6,.Ls6e
213 .quad .Ld7,.Ls7e
214 .quad .Ld8,.Ls8e
215 .quad .Ls9,.Le_quad
216 .quad .Ld9,.Le_quad
217 .quad .Ls10,.Le_byte
218 .quad .Ld10,.Le_byte
219#ifdef FIX_ALIGNMENT
220 .quad .Ls11,.Lzero_rest
221 .quad .Ld11,.Lzero_rest
222#endif
223 .quad .Le5,.Le_zero
224 .previous
225
226 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
227 pessimistic side. this is gross. it would be better to fix the
228 interface. */
229 /* eax: zero, ebx: 64 */
230.Ls1e: addl $8,%eax
231.Ls2e: addl $8,%eax
232.Ls3e: addl $8,%eax
233.Ls4e: addl $8,%eax
234.Ls5e: addl $8,%eax
235.Ls6e: addl $8,%eax
236.Ls7e: addl $8,%eax
237.Ls8e: addl $8,%eax
238 addq %rbx,%rdi /* +64 */
239 subq %rax,%rdi /* correct destination with computed offset */
240
241 shlq $6,%rdx /* loop counter * 64 (stride length) */
242 addq %rax,%rdx /* add offset to loopcnt */
243 andl $63,%ecx /* remaining bytes */
244 addq %rcx,%rdx /* add them */
245 jmp .Lzero_rest
246
247 /* exception on quad word loop in tail handling */
248 /* ecx: loopcnt/8, %edx: length, rdi: correct */
249.Le_quad:
250 shll $3,%ecx
251 andl $7,%edx
252 addl %ecx,%edx
253 /* edx: bytes to zero, rdi: dest, eax:zero */
254.Lzero_rest:
255 movq %rdx,%rcx
256.Le_byte:
257 xorl %eax,%eax
258.Le5: rep
259 stosb
260 /* when there is another exception while zeroing the rest just return */
261.Le_zero:
262 movq %rdx,%rax
263 jmp .Lende
264
265 /* C stepping K8 run faster using the string copy instructions.
266 This is also a lot simpler. Use them when possible.
267 Patch in jmps to this code instead of copying it fully
268 to avoid unwanted aliasing in the exception tables. */
269
270 /* rdi destination
271 * rsi source
272 * rdx count
273 *
274 * Output:
275 * eax uncopied bytes or 0 if successfull.
276 */
277copy_user_generic_c:
278 movl %edx,%ecx 69 movl %edx,%ecx
279 shrl $3,%ecx 70 shrl $3,%ecx
280 andl $7,%edx 71 andl $7,%edx
72 jz 5f
2811: rep 731: rep
282 movsq 74 movsq
283 movl %edx,%ecx 75 movl %edx,%ecx
76 xor %eax,%eax
2842: rep 772: rep
285 movsb 78 movsb
2864: movl %ecx,%eax
287 ret 79 ret
80 /* align here? */
815: xorl %eax,%eax
826: rep movsq
83 ret
84
85 .section .fixup,"ax"
2883: lea (%rdx,%rcx,8),%rax 863: lea (%rdx,%rcx,8),%rax
289 ret 87 ret
290 884: movl %ecx,%eax
89 ret
90 .previous
91
291 .section __ex_table,"a" 92 .section __ex_table,"a"
292 .quad 1b,3b 93 .quad 1b,3b
293 .quad 2b,4b 94 .quad 2b,4b
95 .quad 6b,4b
294 .previous 96 .previous