author     Andi Kleen <ak@suse.de>                  2006-01-11 16:44:45 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>   2006-01-11 22:04:55 -0500
commit     2cbc9ee35c992f88fb2558a4f13920e3df4ef3bb (patch)
tree       51bed27fe9050cae3e7de2d24f55874455316ab2 /arch/x86_64
parent     17a941d854a3f7b0bb916fdeee4c9ffdcc19d8d3 (diff)
[PATCH] x86_64: Clean up copy_*_user
- Remove optimization for old B stepping Opteron
- Make the fast path for copies with a multiple of eight length faster.
- Minor instruction rearrangement to hopefully avoid a pipeline
stall or two.
- Add comment about errata to consider.
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/x86_64')
-rw-r--r--  arch/x86_64/lib/copy_user.S  244
1 file changed, 23 insertions(+), 221 deletions(-)
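In short, the patch deletes the runtime-patched choice (via .altinstructions) between an unrolled 64-byte loop and a string-copy variant, leaving rep movsq/movsb as the only implementation, plus a dedicated fast path taken when the length is a multiple of eight. A minimal C model of the new control flow follows; the function name is made up for illustration, and real fault handling goes through __ex_table/.fixup rather than plain returns:

#include <stddef.h>
#include <string.h>

/* Illustrative model of the new copy_user_generic (returns uncopied
 * bytes, 0 on success).  Faults are not modelled here; see the .fixup
 * entries in the patch for how the real code recovers. */
static size_t copy_user_generic_model(void *to, const void *from, size_t len)
{
	size_t quads = len >> 3;        /* shrl $3,%ecx */
	size_t tail  = len & 7;         /* andl $7,%edx */

	if (tail == 0) {                /* jz 5f: multiple-of-eight fast path */
		memcpy(to, from, quads * 8);   /* 6: rep movsq, no byte pass */
		return 0;
	}
	memcpy(to, from, quads * 8);    /* 1: rep movsq */
	memcpy((char *)to + quads * 8,  /* 2: rep movsb for the tail */
	       (const char *)from + quads * 8, tail);
	return 0;                       /* xor %eax,%eax: nothing uncopied */
}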
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
index dfa358b05c89..79422b6559c3 100644
--- a/arch/x86_64/lib/copy_user.S
+++ b/arch/x86_64/lib/copy_user.S
@@ -4,12 +4,9 @@
  * Functions to copy from and to user space.
  */
 
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
-#include <asm/cpufeature.h>
 
 /* Standard copy_to_user with segment limit checking */
 .globl copy_to_user
@@ -21,23 +18,7 @@ copy_to_user:
 	jc bad_to_user
 	cmpq threadinfo_addr_limit(%rax),%rcx
 	jae bad_to_user
-2:
-	.byte 0xe9	/* 32bit jump */
-	.long .Lcug-1f
-1:
-
-	.section .altinstr_replacement,"ax"
-3:	.byte 0xe9	/* replacement jmp with 8 bit immediate */
-	.long copy_user_generic_c-1b	/* offset */
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad 2b
-	.quad 3b
-	.byte X86_FEATURE_K8_C
-	.byte 5
-	.byte 5
-	.previous
+	jmp copy_user_generic
 
 /* Standard copy_from_user with segment limit checking */
 .globl copy_from_user
@@ -72,223 +53,44 @@ bad_to_user:
  * rsi source
  * rdx count
  *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ *
  * Output:
  * eax uncopied bytes or 0 if successful.
  */
-	.globl copy_user_generic
-	.p2align 4
-copy_user_generic:
-	.byte 0x66,0x66,0x90	/* 5 byte nop for replacement jump */
-	.byte 0x66,0x90
-1:
-	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9	/* near jump with 32bit immediate */
-	.long copy_user_generic_c-1b	/* offset */
-	.previous
-	.section .altinstructions,"a"
-	.align 8
-	.quad copy_user_generic
-	.quad 2b
-	.byte X86_FEATURE_K8_C
-	.byte 5
-	.byte 5
-	.previous
-.Lcug:
-	pushq %rbx
-	xorl %eax,%eax		/*zero for the exception handler */
-
-#ifdef FIX_ALIGNMENT
-	/* check for bad alignment of destination */
-	movl %edi,%ecx
-	andl $7,%ecx
-	jnz .Lbad_alignment
-.Lafter_bad_alignment:
-#endif
 
-	movq %rdx,%rcx
-
-	movl $64,%ebx
-	shrq $6,%rdx
-	decq %rdx
-	js .Lhandle_tail
-
-	.p2align 4
-.Lloop:
-.Ls1:	movq (%rsi),%r11
-.Ls2:	movq 1*8(%rsi),%r8
-.Ls3:	movq 2*8(%rsi),%r9
-.Ls4:	movq 3*8(%rsi),%r10
-.Ld1:	movq %r11,(%rdi)
-.Ld2:	movq %r8,1*8(%rdi)
-.Ld3:	movq %r9,2*8(%rdi)
-.Ld4:	movq %r10,3*8(%rdi)
-
-.Ls5:	movq 4*8(%rsi),%r11
-.Ls6:	movq 5*8(%rsi),%r8
-.Ls7:	movq 6*8(%rsi),%r9
-.Ls8:	movq 7*8(%rsi),%r10
-.Ld5:	movq %r11,4*8(%rdi)
-.Ld6:	movq %r8,5*8(%rdi)
-.Ld7:	movq %r9,6*8(%rdi)
-.Ld8:	movq %r10,7*8(%rdi)
-
-	decq %rdx
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
-
-	jns .Lloop
-
-	.p2align 4
-.Lhandle_tail:
-	movl %ecx,%edx
-	andl $63,%ecx
-	shrl $3,%ecx
-	jz .Lhandle_7
-	movl $8,%ebx
-	.p2align 4
-.Lloop_8:
-.Ls9:	movq (%rsi),%r8
-.Ld9:	movq %r8,(%rdi)
-	decl %ecx
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
-	jnz .Lloop_8
-
-.Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
-	.p2align 4
-.Lloop_1:
-.Ls10:	movb (%rsi),%bl
-.Ld10:	movb %bl,(%rdi)
-	incq %rdi
-	incq %rsi
-	decl %ecx
-	jnz .Lloop_1
-
-.Lende:
-	popq %rbx
-	ret
-
-#ifdef FIX_ALIGNMENT
-	/* align destination */
-	.p2align 4
-.Lbad_alignment:
-	movl $8,%r9d
-	subl %ecx,%r9d
-	movl %r9d,%ecx
-	cmpq %r9,%rdx
-	jz .Lhandle_7
-	js .Lhandle_7
-.Lalign_1:
-.Ls11:	movb (%rsi),%bl
-.Ld11:	movb %bl,(%rdi)
-	incq %rsi
-	incq %rdi
-	decl %ecx
-	jnz .Lalign_1
-	subq %r9,%rdx
-	jmp .Lafter_bad_alignment
-#endif
-
-	/* table sorted by exception address */
-	.section __ex_table,"a"
-	.align 8
-	.quad .Ls1,.Ls1e
-	.quad .Ls2,.Ls2e
-	.quad .Ls3,.Ls3e
-	.quad .Ls4,.Ls4e
-	.quad .Ld1,.Ls1e
-	.quad .Ld2,.Ls2e
-	.quad .Ld3,.Ls3e
-	.quad .Ld4,.Ls4e
-	.quad .Ls5,.Ls5e
-	.quad .Ls6,.Ls6e
-	.quad .Ls7,.Ls7e
-	.quad .Ls8,.Ls8e
-	.quad .Ld5,.Ls5e
-	.quad .Ld6,.Ls6e
-	.quad .Ld7,.Ls7e
-	.quad .Ld8,.Ls8e
-	.quad .Ls9,.Le_quad
-	.quad .Ld9,.Le_quad
-	.quad .Ls10,.Le_byte
-	.quad .Ld10,.Le_byte
-#ifdef FIX_ALIGNMENT
-	.quad .Ls11,.Lzero_rest
-	.quad .Ld11,.Lzero_rest
-#endif
-	.quad .Le5,.Le_zero
-	.previous
-
-	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
-	   pessimistic side. this is gross. it would be better to fix the
-	   interface. */
-	/* eax: zero, ebx: 64 */
-.Ls1e:	addl $8,%eax
-.Ls2e:	addl $8,%eax
-.Ls3e:	addl $8,%eax
-.Ls4e:	addl $8,%eax
-.Ls5e:	addl $8,%eax
-.Ls6e:	addl $8,%eax
-.Ls7e:	addl $8,%eax
-.Ls8e:	addl $8,%eax
-	addq %rbx,%rdi	/* +64 */
-	subq %rax,%rdi	/* correct destination with computed offset */
-
-	shlq $6,%rdx	/* loop counter * 64 (stride length) */
-	addq %rax,%rdx	/* add offset to loopcnt */
-	andl $63,%ecx	/* remaining bytes */
-	addq %rcx,%rdx	/* add them */
-	jmp .Lzero_rest
-
-	/* exception on quad word loop in tail handling */
-	/* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-	shll $3,%ecx
-	andl $7,%edx
-	addl %ecx,%edx
-	/* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-	movq %rdx,%rcx
-.Le_byte:
-	xorl %eax,%eax
-.Le5:	rep
-	stosb
-	/* when there is another exception while zeroing the rest just return */
-.Le_zero:
-	movq %rdx,%rax
-	jmp .Lende
-
-	/* C stepping K8 run faster using the string copy instructions.
-	   This is also a lot simpler. Use them when possible.
-	   Patch in jmps to this code instead of copying it fully
-	   to avoid unwanted aliasing in the exception tables. */
-
-/* rdi	destination
- * rsi	source
- * rdx	count
- *
- * Output:
- * eax	uncopied bytes or 0 if successfull.
- */
-copy_user_generic_c:
+	.globl copy_user_generic
+copy_user_generic:
 	movl %edx,%ecx
 	shrl $3,%ecx
 	andl $7,%edx
+	jz 5f
 1:	rep
 	movsq
 	movl %edx,%ecx
+	xor %eax,%eax
 2:	rep
 	movsb
-4:	movl %ecx,%eax
 	ret
+	/* align here? */
+5:	xorl %eax,%eax
+6:	rep movsq
+	ret
+
+	.section .fixup,"ax"
 3:	lea (%rdx,%rcx,8),%rax
 	ret
-
+4:	movl %ecx,%eax
+	ret
+	.previous
+
 	.section __ex_table,"a"
 	.quad 1b,3b
 	.quad 2b,4b
+	.quad 6b,4b
 	.previous
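A note on the new .fixup section: rep leaves its progress in registers, which is what makes the short fixup arithmetic work. If the quadword copy at label 1 faults, %rcx still holds the quadwords left and %rdx the byte tail, so fixup entry 3 computes the uncopied byte count as rdx + rcx*8 (lea (%rdx,%rcx,8),%rax); this is the pairing recorded by ".quad 1b,3b". If the byte copy at label 2 faults, %ecx holds the bytes left, so entry 4 simply returns it (movl %ecx,%eax), per ".quad 2b,4b". The same arithmetic as a C sketch (helper names are hypothetical):

/* Uncopied bytes after a fault in "rep movsq": %rcx quadwords remain
 * and the %rdx-byte tail was never started.  Mirrors fixup entry 3. */
static unsigned long fixup_after_movsq(unsigned long rcx, unsigned long rdx)
{
	return rdx + rcx * 8;           /* lea (%rdx,%rcx,8),%rax */
}

/* Uncopied bytes after a fault in "rep movsb": %ecx bytes remain.
 * Mirrors fixup entry 4. */
static unsigned long fixup_after_movsb(unsigned long rcx)
{
	return rcx;                     /* movl %ecx,%eax */
}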