1 files changed, 173 insertions, 259 deletions
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 70bebd310408..dfdf428975c0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs.
+/*
+ * Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
+ * Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
- * 
+ *
- * Functions to copy from and to user space.            
+ * Functions to copy from and to user space.
- */              
+ */
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
@@ -20,60 +22,88 @@
        .long \orig-1f  /* by default jump to orig */
 1:
        .section .altinstr_replacement,"ax"
-2:      .byte 0xe9                   /* near jump with 32bit immediate */
+2:      .byte 0xe9                      /* near jump with 32bit immediate */
        .long \alt-1b /* offset */   /* or alternatively to alt */
        .previous
        .section .altinstructions,"a"
        .align 8
        .quad  0b
        .quad  2b
-        .byte  \feature              /* when feature is set */
+        .byte  \feature                 /* when feature is set */
        .byte  5
        .byte  5
        .previous
        .endm
-/* Standard copy_to_user with segment limit checking */         
+        .macro ALIGN_DESTINATION
+#ifdef FIX_ALIGNMENT
+        /*
+         * Copy single bytes until the destination (%rdi) is 8-byte aligned.
+         *
+         * In:    rdi destination, rsi source, edx count.  Both expansions
+         *        visible in this file branch away when edx < 8 before
+         *        reaching the macro, so the at most 7 prefix bytes never
+         *        exceed the count.
+         * Out:   rsi/rdi advanced past the prefix, edx reduced by it.
+         * Clobb: ecx, al, flags.
+         */
+        movl %edi,%ecx
+        andl $7,%ecx                    /* ecx = misalignment of destination */
+        jz 102f                         /* already aligned */
+        subl $8,%ecx
+        negl %ecx                       /* ecx = 8 - misalignment = prefix bytes */
+        subl %ecx,%edx                  /* edx = bytes left after the prefix */
+100:    movb (%rsi),%al
+101:    movb %al,(%rdi)
+        incq %rsi
+        incq %rdi
+        decl %ecx
+        jnz 100b
+102:
+        .section .fixup,"ax"
+        /*
+         * Fault at 100/101: ecx = prefix bytes not copied yet (non-zero),
+         * edx = bytes after the prefix, so the uncopied total is edx + ecx.
+         * r8d is never loaded before this point and must not be used here.
+         */
+103:    addl %ecx,%edx                  /* ecx is zerorest also */
+        jmp copy_user_handle_tail
+        .previous
+        .section __ex_table,"a"
+        .align 8
+        .quad 100b,103b
+        .quad 101b,103b
+        .previous
+#endif
+        .endm
+/* Standard copy_to_user with segment limit checking */
 ENTRY(copy_to_user)
        CFI_STARTPROC
        GET_THREAD_INFO(%rax)
        movq %rdi,%rcx
        addq %rdx,%rcx
-        jc  bad_to_user
+        jc bad_to_user
-        cmpq threadinfo_addr_limit(%rax),%rcx
+        cmpq TI_addr_limit(%rax),%rcx
        jae bad_to_user
-        xorl %eax,%eax  /* clear zero flag */
        ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
        CFI_ENDPROC
-ENTRY(copy_user_generic)
+/* Standard copy_from_user with segment limit checking */
+ENTRY(copy_from_user)
        CFI_STARTPROC
-        movl $1,%ecx    /* set zero flag */
+        GET_THREAD_INFO(%rax)
+        movq %rsi,%rcx
+        addq %rdx,%rcx
+        jc bad_from_user
+        cmpq TI_addr_limit(%rax),%rcx
+        jae bad_from_user
        ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
        CFI_ENDPROC
+ENDPROC(copy_from_user)
-ENTRY(__copy_from_user_inatomic)
+ENTRY(copy_user_generic)
        CFI_STARTPROC
-        xorl %ecx,%ecx  /* clear zero flag */
        ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
        CFI_ENDPROC
+ENDPROC(copy_user_generic)
-/* Standard copy_from_user with segment limit checking */       
+ENTRY(__copy_from_user_inatomic)
-ENTRY(copy_from_user)
        CFI_STARTPROC
-        GET_THREAD_INFO(%rax)
-        movq %rsi,%rcx
-        addq %rdx,%rcx
-        jc  bad_from_user
-        cmpq threadinfo_addr_limit(%rax),%rcx
-        jae  bad_from_user
-        movl $1,%ecx    /* set zero flag */
        ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
        CFI_ENDPROC
-ENDPROC(copy_from_user)
+ENDPROC(__copy_from_user_inatomic)
-        
        .section .fixup,"ax"
        /* must zero dest */
+ENTRY(bad_from_user)
 bad_from_user:
        CFI_STARTPROC
        movl %edx,%ecx
@@ -81,274 +111,158 @@ bad_from_user:
        rep
        stosb
 bad_to_user:
-        movl    %edx,%eax
+        movl %edx,%eax
        ret
        CFI_ENDPROC
-END(bad_from_user)
+ENDPROC(bad_from_user)
        .previous
-        
-                
 /*
 * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
+ * This version is for CPUs like P4 that don't have efficient micro
- *      
+ * code for rep movsq
- * Input:       
+ *
+ * Input:
 * rdi destination
 * rsi source
 * rdx count
- * ecx zero flag -- if true zero destination on error
 *
- * Output:              
+ * Output:
- * eax uncopied bytes or 0 if successful.
+ * eax uncopied bytes or 0 if successful.
 */
 ENTRY(copy_user_generic_unrolled)
        CFI_STARTPROC
-        pushq %rbx
+        cmpl $8,%edx
-        CFI_ADJUST_CFA_OFFSET 8
+        jb 20f          /* less than 8 bytes, go to byte copy loop */
-        CFI_REL_OFFSET rbx, 0
+        ALIGN_DESTINATION
-        pushq %rcx
+        movl %edx,%ecx
-        CFI_ADJUST_CFA_OFFSET 8
+        andl $63,%edx
-        CFI_REL_OFFSET rcx, 0
+        shrl $6,%ecx
-        xorl %eax,%eax          /*zero for the exception handler */
+        jz 17f
+1:      movq (%rsi),%r8
-#ifdef FIX_ALIGNMENT
+2:      movq 1*8(%rsi),%r9
-        /* check for bad alignment of destination */
+3:      movq 2*8(%rsi),%r10
-        movl %edi,%ecx
+4:      movq 3*8(%rsi),%r11
-        andl $7,%ecx
+5:      movq %r8,(%rdi)
-        jnz  .Lbad_alignment
+6:      movq %r9,1*8(%rdi)
-.Lafter_bad_alignment:
+7:      movq %r10,2*8(%rdi)
-#endif
+8:      movq %r11,3*8(%rdi)
+9:      movq 4*8(%rsi),%r8
-        movq %rdx,%rcx
+10:     movq 5*8(%rsi),%r9
+11:     movq 6*8(%rsi),%r10
-        movl $64,%ebx
+12:     movq 7*8(%rsi),%r11
-        shrq $6,%rdx
+13:     movq %r8,4*8(%rdi)
-        decq %rdx
+14:     movq %r9,5*8(%rdi)
-        js   .Lhandle_tail
+15:     movq %r10,6*8(%rdi)
+16:     movq %r11,7*8(%rdi)
-        .p2align 4
-.Lloop:
-.Ls1:   movq (%rsi),%r11
-.Ls2:   movq 1*8(%rsi),%r8
-.Ls3:   movq 2*8(%rsi),%r9
-.Ls4:   movq 3*8(%rsi),%r10
-.Ld1:   movq %r11,(%rdi)
-.Ld2:   movq %r8,1*8(%rdi)
-.Ld3:   movq %r9,2*8(%rdi)
-.Ld4:   movq %r10,3*8(%rdi)
-.Ls5:   movq 4*8(%rsi),%r11
-.Ls6:   movq 5*8(%rsi),%r8
-.Ls7:   movq 6*8(%rsi),%r9
-.Ls8:   movq 7*8(%rsi),%r10
-.Ld5:   movq %r11,4*8(%rdi)
-.Ld6:   movq %r8,5*8(%rdi)
-.Ld7:   movq %r9,6*8(%rdi)
-.Ld8:   movq %r10,7*8(%rdi)
-        decq %rdx
        leaq 64(%rsi),%rsi
        leaq 64(%rdi),%rdi
-        jns  .Lloop
-        .p2align 4
-.Lhandle_tail:
-        movl %ecx,%edx
-        andl $63,%ecx
-        shrl $3,%ecx
-        jz   .Lhandle_7
-        movl $8,%ebx
-        .p2align 4
-.Lloop_8:
-.Ls9:   movq (%rsi),%r8
-.Ld9:   movq %r8,(%rdi)
        decl %ecx
-        leaq 8(%rdi),%rdi
+        jnz 1b
+17:     movl %edx,%ecx
+        andl $7,%edx
+        shrl $3,%ecx
+        jz 20f
+18:     movq (%rsi),%r8
+19:     movq %r8,(%rdi)
        leaq 8(%rsi),%rsi
-        jnz .Lloop_8
+        leaq 8(%rdi),%rdi
+        decl %ecx
-.Lhandle_7:
+        jnz 18b
+20:     andl %edx,%edx
+        jz 23f
        movl %edx,%ecx
-        andl $7,%ecx
+21:     movb (%rsi),%al
-        jz   .Lende
+22:     movb %al,(%rdi)
-        .p2align 4
-.Lloop_1:
-.Ls10:  movb (%rsi),%bl
-.Ld10:  movb %bl,(%rdi)
-        incq %rdi
        incq %rsi
+        incq %rdi
        decl %ecx
-        jnz .Lloop_1
+        jnz 21b
+23:     xor %eax,%eax
-        CFI_REMEMBER_STATE
-.Lende:
-        popq %rcx
-        CFI_ADJUST_CFA_OFFSET -8
-        CFI_RESTORE rcx
-        popq %rbx
-        CFI_ADJUST_CFA_OFFSET -8
-        CFI_RESTORE rbx
        ret
-        CFI_RESTORE_STATE
-#ifdef FIX_ALIGNMENT
+        .section .fixup,"ax"
-        /* align destination */
+30:     shll $6,%ecx
-        .p2align 4
+        addl %ecx,%edx
-.Lbad_alignment:
+        jmp 60f
-        movl $8,%r9d
+40:     lea (%rdx,%rcx,8),%rdx
-        subl %ecx,%r9d
+        jmp 60f
-        movl %r9d,%ecx
+50:     movl %ecx,%edx
-        cmpq %r9,%rdx
+60:     jmp copy_user_handle_tail /* ecx is zerorest also */
-        jz   .Lhandle_7
+        .previous
-        js   .Lhandle_7
-.Lalign_1:
-.Ls11:  movb (%rsi),%bl
-.Ld11:  movb %bl,(%rdi)
-        incq %rsi
-        incq %rdi
-        decl %ecx
-        jnz .Lalign_1
-        subq %r9,%rdx
-        jmp .Lafter_bad_alignment
-#endif
-        /* table sorted by exception address */
        .section __ex_table,"a"
        .align 8
-        .quad .Ls1,.Ls1e
+        .quad 1b,30b
-        .quad .Ls2,.Ls2e
+        .quad 2b,30b
-        .quad .Ls3,.Ls3e
+        .quad 3b,30b
-        .quad .Ls4,.Ls4e
+        .quad 4b,30b
-        .quad .Ld1,.Ls1e
+        .quad 5b,30b
-        .quad .Ld2,.Ls2e
+        .quad 6b,30b
-        .quad .Ld3,.Ls3e
+        .quad 7b,30b
-        .quad .Ld4,.Ls4e
+        .quad 8b,30b
-        .quad .Ls5,.Ls5e
+        .quad 9b,30b
-        .quad .Ls6,.Ls6e
+        .quad 10b,30b
-        .quad .Ls7,.Ls7e
+        .quad 11b,30b
-        .quad .Ls8,.Ls8e
+        .quad 12b,30b
-        .quad .Ld5,.Ls5e
+        .quad 13b,30b
-        .quad .Ld6,.Ls6e
+        .quad 14b,30b
-        .quad .Ld7,.Ls7e
+        .quad 15b,30b
-        .quad .Ld8,.Ls8e
+        .quad 16b,30b
-        .quad .Ls9,.Le_quad
+        .quad 18b,40b
-        .quad .Ld9,.Le_quad
+        .quad 19b,40b
-        .quad .Ls10,.Le_byte
+        .quad 21b,50b
-        .quad .Ld10,.Le_byte
+        .quad 22b,50b
-#ifdef FIX_ALIGNMENT
-        .quad .Ls11,.Lzero_rest
-        .quad .Ld11,.Lzero_rest
-#endif
-        .quad .Le5,.Le_zero
        .previous
-        /* compute 64-offset for main loop. 8 bytes accuracy with error on the
-           pessimistic side. this is gross. it would be better to fix the
-           interface. */
-        /* eax: zero, ebx: 64 */
-.Ls1e:  addl $8,%eax
-.Ls2e:  addl $8,%eax
-.Ls3e:  addl $8,%eax
-.Ls4e:  addl $8,%eax
-.Ls5e:  addl $8,%eax
-.Ls6e:  addl $8,%eax
-.Ls7e:  addl $8,%eax
-.Ls8e:  addl $8,%eax
-        addq %rbx,%rdi  /* +64 */
-        subq %rax,%rdi  /* correct destination with computed offset */
-        shlq $6,%rdx    /* loop counter * 64 (stride length) */
-        addq %rax,%rdx  /* add offset to loopcnt */
-        andl $63,%ecx   /* remaining bytes */
-        addq %rcx,%rdx  /* add them */
-        jmp .Lzero_rest
-        /* exception on quad word loop in tail handling */
-        /* ecx: loopcnt/8, %edx: length, rdi: correct */
-.Le_quad:
-        shll $3,%ecx
-        andl $7,%edx
-        addl %ecx,%edx
-        /* edx: bytes to zero, rdi: dest, eax:zero */
-.Lzero_rest:
-        cmpl $0,(%rsp)
-        jz   .Le_zero
-        movq %rdx,%rcx
-.Le_byte:
-        xorl %eax,%eax
-.Le5:   rep
-        stosb
-        /* when there is another exception while zeroing the rest just return */
-.Le_zero:
-        movq %rdx,%rax
-        jmp .Lende
        CFI_ENDPROC
-ENDPROC(copy_user_generic)
+ENDPROC(copy_user_generic_unrolled)
+/* Some CPUs run faster using the string copy instructions.
-        /* Some CPUs run faster using the string copy instructions.
+ * This is also a lot simpler. Use them when possible.
-           This is also a lot simpler. Use them when possible.
+ *
-           Patch in jmps to this code instead of copying it fully
+ * Only 4GB of copy is supported. This shouldn't be a problem
-           to avoid unwanted aliasing in the exception tables. */
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
- /* rdi destination
+ * And more would be dangerous because both Intel and AMD have
-  * rsi source
+ * errata with rep movsq > 4GB. If someone feels the need to fix
-  * rdx count
+ * this please consider this.
-  * ecx zero flag
+ *
-  *
+ * Input:
-  * Output:
+ * rdi destination
-  * eax uncopied bytes or 0 if successfull.
+ * rsi source
-  *
+ * rdx count
-  * Only 4GB of copy is supported. This shouldn't be a problem
+ *
-  * because the kernel normally only writes from/to page sized chunks
+ * Output:
-  * even if user space passed a longer buffer.
+ * eax uncopied bytes or 0 if successful.
-  * And more would be dangerous because both Intel and AMD have
+ */
-  * errata with rep movsq > 4GB. If someone feels the need to fix
-  * this please consider this.
-  */
 ENTRY(copy_user_generic_string)
        CFI_STARTPROC
-        movl %ecx,%r8d          /* save zero flag */
+        andl %edx,%edx
+        jz 4f
+        cmpl $8,%edx
+        jb 2f           /* less than 8 bytes, go to byte copy loop */
+        ALIGN_DESTINATION
        movl %edx,%ecx
        shrl $3,%ecx
-        andl $7,%edx    
+        andl $7,%edx
-        jz   10f
+1:      rep
-1:      rep 
-        movsq 
-        movl %edx,%ecx
-2:      rep
-        movsb
-9:      movl %ecx,%eax
-        ret
-        /* multiple of 8 byte */
-10:     rep
        movsq
-        xor %eax,%eax
+2:      movl %edx,%ecx
+3:      rep
+        movsb
+4:      xorl %eax,%eax
        ret
-        /* exception handling */
+        .section .fixup,"ax"
-3:      lea (%rdx,%rcx,8),%rax  /* exception on quad loop */
+11:     lea (%rdx,%rcx,8),%rcx
-        jmp 6f
+12:     movl %ecx,%edx          /* ecx is zerorest also */
-5:      movl %ecx,%eax          /* exception on byte loop */
+        jmp copy_user_handle_tail
-        /* eax: left over bytes */
+        .previous
-6:      testl %r8d,%r8d         /* zero flag set? */
-        jz 7f
-        movl %eax,%ecx          /* initialize x86 loop counter */
-        push %rax
-        xorl %eax,%eax
-8:      rep
-        stosb                   /* zero the rest */
-11:     pop %rax
-7:      ret
-        CFI_ENDPROC
-END(copy_user_generic_c)
        .section __ex_table,"a"
-        .quad 1b,3b
+        .align 8
-        .quad 2b,5b
+        .quad 1b,11b
-        .quad 8b,11b
+        .quad 3b,12b
-        .quad 10b,3b
        .previous
+        CFI_ENDPROC
+ENDPROC(copy_user_generic_string)

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 70bebd310408..dfdf428975c0 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S
@@ -1,8 +1,10 @@
1	/* Copyright 2002 Andi Kleen, SuSE Labs.	1	/*
		2	* Copyright 2008 Vitaly Mayatskikh <vmayatsk@redhat.com>
		3	* Copyright 2002 Andi Kleen, SuSE Labs.
2	* Subject to the GNU Public License v2.	4	* Subject to the GNU Public License v2.
3	*	5	*
4	* Functions to copy from and to user space.	6	* Functions to copy from and to user space.
5	*/	7	*/
6		8
7	#include <linux/linkage.h>	9	#include <linux/linkage.h>
8	#include <asm/dwarf2.h>	10	#include <asm/dwarf2.h>
@@ -20,60 +22,88 @@
20	.long \orig-1f /* by default jump to orig */	22	.long \orig-1f /* by default jump to orig */
21	1:	23	1:
22	.section .altinstr_replacement,"ax"	24	.section .altinstr_replacement,"ax"
23	2: .byte 0xe9 /* near jump with 32bit immediate */	25	2: .byte 0xe9 /* near jump with 32bit immediate */
24	.long \alt-1b /* offset */ /* or alternatively to alt */	26	.long \alt-1b /* offset */ /* or alternatively to alt */
25	.previous	27	.previous
26	.section .altinstructions,"a"	28	.section .altinstructions,"a"
27	.align 8	29	.align 8
28	.quad 0b	30	.quad 0b
29	.quad 2b	31	.quad 2b
30	.byte \feature /* when feature is set */	32	.byte \feature /* when feature is set */
31	.byte 5	33	.byte 5
32	.byte 5	34	.byte 5
33	.previous	35	.previous
34	.endm	36	.endm
35		37
36	/* Standard copy_to_user with segment limit checking */	38	.macro ALIGN_DESTINATION
		39	#ifdef FIX_ALIGNMENT
		40	/* check for bad alignment of destination */
		41	movl %edi,%ecx
		42	andl $7,%ecx
		43	jz 102f /* already aligned */
		44	subl $8,%ecx
		45	negl %ecx /* ecx = 8 - misalignment = prefix bytes to copy */
		46	subl %ecx,%edx /* edx = bytes left after the prefix */
		47	100: movb (%rsi),%al
		48	101: movb %al,(%rdi)
		49	incq %rsi
		50	incq %rdi
		51	decl %ecx
		52	jnz 100b
		53	102:
		54	.section .fixup,"ax"
		55	103: addl %ecx,%edx /* uncopied = edx + prefix bytes left in ecx (r8d is never loaded here); ecx is zerorest also */
		56	jmp copy_user_handle_tail
		57	.previous
		58
		59	.section __ex_table,"a"
		60	.align 8
		61	.quad 100b,103b
		62	.quad 101b,103b
		63	.previous
		64	#endif
		65	.endm
		66
		67	/* Standard copy_to_user with segment limit checking */
37	ENTRY(copy_to_user)	68	ENTRY(copy_to_user)
38	CFI_STARTPROC	69	CFI_STARTPROC
39	GET_THREAD_INFO(%rax)	70	GET_THREAD_INFO(%rax)
40	movq %rdi,%rcx	71	movq %rdi,%rcx
41	addq %rdx,%rcx	72	addq %rdx,%rcx
42	jc bad_to_user	73	jc bad_to_user
43	cmpq threadinfo_addr_limit(%rax),%rcx	74	cmpq TI_addr_limit(%rax),%rcx
44	jae bad_to_user	75	jae bad_to_user
45	xorl %eax,%eax /* clear zero flag */
46	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string	76	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
47	CFI_ENDPROC	77	CFI_ENDPROC
48		78
49	ENTRY(copy_user_generic)	79	/* Standard copy_from_user with segment limit checking */
		80	ENTRY(copy_from_user)
50	CFI_STARTPROC	81	CFI_STARTPROC
51	movl $1,%ecx /* set zero flag */	82	GET_THREAD_INFO(%rax)
		83	movq %rsi,%rcx
		84	addq %rdx,%rcx
		85	jc bad_from_user
		86	cmpq TI_addr_limit(%rax),%rcx
		87	jae bad_from_user
52	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string	88	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
53	CFI_ENDPROC	89	CFI_ENDPROC
		90	ENDPROC(copy_from_user)
54		91
55	ENTRY(__copy_from_user_inatomic)	92	ENTRY(copy_user_generic)
56	CFI_STARTPROC	93	CFI_STARTPROC
57	xorl %ecx,%ecx /* clear zero flag */
58	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string	94	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
59	CFI_ENDPROC	95	CFI_ENDPROC
		96	ENDPROC(copy_user_generic)
60		97
61	/* Standard copy_from_user with segment limit checking */	98	ENTRY(__copy_from_user_inatomic)
62	ENTRY(copy_from_user)
63	CFI_STARTPROC	99	CFI_STARTPROC
64	GET_THREAD_INFO(%rax)
65	movq %rsi,%rcx
66	addq %rdx,%rcx
67	jc bad_from_user
68	cmpq threadinfo_addr_limit(%rax),%rcx
69	jae bad_from_user
70	movl $1,%ecx /* set zero flag */
71	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string	100	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
72	CFI_ENDPROC	101	CFI_ENDPROC
73	ENDPROC(copy_from_user)	102	ENDPROC(__copy_from_user_inatomic)
74		103
75	.section .fixup,"ax"	104	.section .fixup,"ax"
76	/* must zero dest */	105	/* must zero dest */
		106	ENTRY(bad_from_user)
77	bad_from_user:	107	bad_from_user:
78	CFI_STARTPROC	108	CFI_STARTPROC
79	movl %edx,%ecx	109	movl %edx,%ecx
@@ -81,274 +111,158 @@ bad_from_user:
81	rep	111	rep
82	stosb	112	stosb
83	bad_to_user:	113	bad_to_user:
84	movl %edx,%eax	114	movl %edx,%eax
85	ret	115	ret
86	CFI_ENDPROC	116	CFI_ENDPROC
87	END(bad_from_user)	117	ENDPROC(bad_from_user)
88	.previous	118	.previous
89		119
90
91	/*	120	/*
92	* copy_user_generic_unrolled - memory copy with exception handling.	121	* copy_user_generic_unrolled - memory copy with exception handling.
93	* This version is for CPUs like P4 that don't have efficient micro code for rep movsq	122	* This version is for CPUs like P4 that don't have efficient micro
94	*	123	* code for rep movsq
95	* Input:	124	*
		125	* Input:
96	* rdi destination	126	* rdi destination
97	* rsi source	127	* rsi source
98	* rdx count	128	* rdx count
99	* ecx zero flag -- if true zero destination on error
100	*	129	*
101	* Output:	130	* Output:
102	* eax uncopied bytes or 0 if successful.	131	* eax uncopied bytes or 0 if successful.
103	*/	132	*/
104	ENTRY(copy_user_generic_unrolled)	133	ENTRY(copy_user_generic_unrolled)
105	CFI_STARTPROC	134	CFI_STARTPROC
106	pushq %rbx	135	cmpl $8,%edx
107	CFI_ADJUST_CFA_OFFSET 8	136	jb 20f /* less than 8 bytes, go to byte copy loop */
108	CFI_REL_OFFSET rbx, 0	137	ALIGN_DESTINATION
109	pushq %rcx	138	movl %edx,%ecx
110	CFI_ADJUST_CFA_OFFSET 8	139	andl $63,%edx
111	CFI_REL_OFFSET rcx, 0	140	shrl $6,%ecx
112	xorl %eax,%eax /*zero for the exception handler */	141	jz 17f
113		142	1: movq (%rsi),%r8
114	#ifdef FIX_ALIGNMENT	143	2: movq 1*8(%rsi),%r9
115	/* check for bad alignment of destination */	144	3: movq 2*8(%rsi),%r10
116	movl %edi,%ecx	145	4: movq 3*8(%rsi),%r11
117	andl $7,%ecx	146	5: movq %r8,(%rdi)
118	jnz .Lbad_alignment	147	6: movq %r9,1*8(%rdi)
119	.Lafter_bad_alignment:	148	7: movq %r10,2*8(%rdi)
120	#endif	149	8: movq %r11,3*8(%rdi)
121		150	9: movq 4*8(%rsi),%r8
122	movq %rdx,%rcx	151	10: movq 5*8(%rsi),%r9
123		152	11: movq 6*8(%rsi),%r10
124	movl $64,%ebx	153	12: movq 7*8(%rsi),%r11
125	shrq $6,%rdx	154	13: movq %r8,4*8(%rdi)
126	decq %rdx	155	14: movq %r9,5*8(%rdi)
127	js .Lhandle_tail	156	15: movq %r10,6*8(%rdi)
128		157	16: movq %r11,7*8(%rdi)
129	.p2align 4
130	.Lloop:
131	.Ls1: movq (%rsi),%r11
132	.Ls2: movq 1*8(%rsi),%r8
133	.Ls3: movq 2*8(%rsi),%r9
134	.Ls4: movq 3*8(%rsi),%r10
135	.Ld1: movq %r11,(%rdi)
136	.Ld2: movq %r8,1*8(%rdi)
137	.Ld3: movq %r9,2*8(%rdi)
138	.Ld4: movq %r10,3*8(%rdi)
139
140	.Ls5: movq 4*8(%rsi),%r11
141	.Ls6: movq 5*8(%rsi),%r8
142	.Ls7: movq 6*8(%rsi),%r9
143	.Ls8: movq 7*8(%rsi),%r10
144	.Ld5: movq %r11,4*8(%rdi)
145	.Ld6: movq %r8,5*8(%rdi)
146	.Ld7: movq %r9,6*8(%rdi)
147	.Ld8: movq %r10,7*8(%rdi)
148
149	decq %rdx
150
151	leaq 64(%rsi),%rsi	158	leaq 64(%rsi),%rsi
152	leaq 64(%rdi),%rdi	159	leaq 64(%rdi),%rdi
153
154	jns .Lloop
155
156	.p2align 4
157	.Lhandle_tail:
158	movl %ecx,%edx
159	andl $63,%ecx
160	shrl $3,%ecx
161	jz .Lhandle_7
162	movl $8,%ebx
163	.p2align 4
164	.Lloop_8:
165	.Ls9: movq (%rsi),%r8
166	.Ld9: movq %r8,(%rdi)
167	decl %ecx	160	decl %ecx
168	leaq 8(%rdi),%rdi	161	jnz 1b
		162	17: movl %edx,%ecx
		163	andl $7,%edx
		164	shrl $3,%ecx
		165	jz 20f
		166	18: movq (%rsi),%r8
		167	19: movq %r8,(%rdi)
169	leaq 8(%rsi),%rsi	168	leaq 8(%rsi),%rsi
170	jnz .Lloop_8	169	leaq 8(%rdi),%rdi
171		170	decl %ecx
172	.Lhandle_7:	171	jnz 18b
		172	20: andl %edx,%edx
		173	jz 23f
173	movl %edx,%ecx	174	movl %edx,%ecx
174	andl $7,%ecx	175	21: movb (%rsi),%al
175	jz .Lende	176	22: movb %al,(%rdi)
176	.p2align 4
177	.Lloop_1:
178	.Ls10: movb (%rsi),%bl
179	.Ld10: movb %bl,(%rdi)
180	incq %rdi
181	incq %rsi	177	incq %rsi
		178	incq %rdi
182	decl %ecx	179	decl %ecx
183	jnz .Lloop_1	180	jnz 21b
184		181	23: xor %eax,%eax
185	CFI_REMEMBER_STATE
186	.Lende:
187	popq %rcx
188	CFI_ADJUST_CFA_OFFSET -8
189	CFI_RESTORE rcx
190	popq %rbx
191	CFI_ADJUST_CFA_OFFSET -8
192	CFI_RESTORE rbx
193	ret	182	ret
194	CFI_RESTORE_STATE
195		183
196	#ifdef FIX_ALIGNMENT	184	.section .fixup,"ax"
197	/* align destination */	185	30: shll $6,%ecx
198	.p2align 4	186	addl %ecx,%edx
199	.Lbad_alignment:	187	jmp 60f
200	movl $8,%r9d	188	40: lea (%rdx,%rcx,8),%rdx
201	subl %ecx,%r9d	189	jmp 60f
202	movl %r9d,%ecx	190	50: movl %ecx,%edx
203	cmpq %r9,%rdx	191	60: jmp copy_user_handle_tail /* ecx is zerorest also */
204	jz .Lhandle_7	192	.previous
205	js .Lhandle_7
206	.Lalign_1:
207	.Ls11: movb (%rsi),%bl
208	.Ld11: movb %bl,(%rdi)
209	incq %rsi
210	incq %rdi
211	decl %ecx
212	jnz .Lalign_1
213	subq %r9,%rdx
214	jmp .Lafter_bad_alignment
215	#endif
216		193
217	/* table sorted by exception address */
218	.section __ex_table,"a"	194	.section __ex_table,"a"
219	.align 8	195	.align 8
220	.quad .Ls1,.Ls1e	196	.quad 1b,30b
221	.quad .Ls2,.Ls2e	197	.quad 2b,30b
222	.quad .Ls3,.Ls3e	198	.quad 3b,30b
223	.quad .Ls4,.Ls4e	199	.quad 4b,30b
224	.quad .Ld1,.Ls1e	200	.quad 5b,30b
225	.quad .Ld2,.Ls2e	201	.quad 6b,30b
226	.quad .Ld3,.Ls3e	202	.quad 7b,30b
227	.quad .Ld4,.Ls4e	203	.quad 8b,30b
228	.quad .Ls5,.Ls5e	204	.quad 9b,30b
229	.quad .Ls6,.Ls6e	205	.quad 10b,30b
230	.quad .Ls7,.Ls7e	206	.quad 11b,30b
231	.quad .Ls8,.Ls8e	207	.quad 12b,30b
232	.quad .Ld5,.Ls5e	208	.quad 13b,30b
233	.quad .Ld6,.Ls6e	209	.quad 14b,30b
234	.quad .Ld7,.Ls7e	210	.quad 15b,30b
235	.quad .Ld8,.Ls8e	211	.quad 16b,30b
236	.quad .Ls9,.Le_quad	212	.quad 18b,40b
237	.quad .Ld9,.Le_quad	213	.quad 19b,40b
238	.quad .Ls10,.Le_byte	214	.quad 21b,50b
239	.quad .Ld10,.Le_byte	215	.quad 22b,50b
240	#ifdef FIX_ALIGNMENT
241	.quad .Ls11,.Lzero_rest
242	.quad .Ld11,.Lzero_rest
243	#endif
244	.quad .Le5,.Le_zero
245	.previous	216	.previous
246
247	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
248	pessimistic side. this is gross. it would be better to fix the
249	interface. */
250	/* eax: zero, ebx: 64 */
251	.Ls1e: addl $8,%eax
252	.Ls2e: addl $8,%eax
253	.Ls3e: addl $8,%eax
254	.Ls4e: addl $8,%eax
255	.Ls5e: addl $8,%eax
256	.Ls6e: addl $8,%eax
257	.Ls7e: addl $8,%eax
258	.Ls8e: addl $8,%eax
259	addq %rbx,%rdi /* +64 */
260	subq %rax,%rdi /* correct destination with computed offset */
261
262	shlq $6,%rdx /* loop counter * 64 (stride length) */
263	addq %rax,%rdx /* add offset to loopcnt */
264	andl $63,%ecx /* remaining bytes */
265	addq %rcx,%rdx /* add them */
266	jmp .Lzero_rest
267
268	/* exception on quad word loop in tail handling */
269	/* ecx: loopcnt/8, %edx: length, rdi: correct */
270	.Le_quad:
271	shll $3,%ecx
272	andl $7,%edx
273	addl %ecx,%edx
274	/* edx: bytes to zero, rdi: dest, eax:zero */
275	.Lzero_rest:
276	cmpl $0,(%rsp)
277	jz .Le_zero
278	movq %rdx,%rcx
279	.Le_byte:
280	xorl %eax,%eax
281	.Le5: rep
282	stosb
283	/* when there is another exception while zeroing the rest just return */
284	.Le_zero:
285	movq %rdx,%rax
286	jmp .Lende
287	CFI_ENDPROC	217	CFI_ENDPROC
288	ENDPROC(copy_user_generic)	218	ENDPROC(copy_user_generic_unrolled)
289		219
290		220	/* Some CPUs run faster using the string copy instructions.
291	/* Some CPUs run faster using the string copy instructions.	221	* This is also a lot simpler. Use them when possible.
292	This is also a lot simpler. Use them when possible.	222	*
293	Patch in jmps to this code instead of copying it fully	223	* Only 4GB of copy is supported. This shouldn't be a problem
294	to avoid unwanted aliasing in the exception tables. */	224	* because the kernel normally only writes from/to page sized chunks
295		225	* even if user space passed a longer buffer.
296	/* rdi destination	226	* And more would be dangerous because both Intel and AMD have
297	* rsi source	227	* errata with rep movsq > 4GB. If someone feels the need to fix
298	* rdx count	228	* this please consider this.
299	* ecx zero flag	229	*
300	*	230	* Input:
301	* Output:	231	* rdi destination
302	* eax uncopied bytes or 0 if successfull.	232	* rsi source
303	*	233	* rdx count
304	* Only 4GB of copy is supported. This shouldn't be a problem	234	*
305	* because the kernel normally only writes from/to page sized chunks	235	* Output:
306	* even if user space passed a longer buffer.	236	* eax uncopied bytes or 0 if successful.
307	* And more would be dangerous because both Intel and AMD have	237	*/
308	* errata with rep movsq > 4GB. If someone feels the need to fix
309	* this please consider this.
310	*/
311	ENTRY(copy_user_generic_string)	238	ENTRY(copy_user_generic_string)
312	CFI_STARTPROC	239	CFI_STARTPROC
313	movl %ecx,%r8d /* save zero flag */	240	andl %edx,%edx
		241	jz 4f
		242	cmpl $8,%edx
		243	jb 2f /* less than 8 bytes, go to byte copy loop */
		244	ALIGN_DESTINATION
314	movl %edx,%ecx	245	movl %edx,%ecx
315	shrl $3,%ecx	246	shrl $3,%ecx
316	andl $7,%edx	247	andl $7,%edx
317	jz 10f	248	1: rep
318	1: rep
319	movsq
320	movl %edx,%ecx
321	2: rep
322	movsb
323	9: movl %ecx,%eax
324	ret
325
326	/* multiple of 8 byte */
327	10: rep
328	movsq	249	movsq
329	xor %eax,%eax	250	2: movl %edx,%ecx
		251	3: rep
		252	movsb
		253	4: xorl %eax,%eax
330	ret	254	ret
331		255
332	/* exception handling */	256	.section .fixup,"ax"
333	3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */	257	11: lea (%rdx,%rcx,8),%rcx
334	jmp 6f	258	12: movl %ecx,%edx /* ecx is zerorest also */
335	5: movl %ecx,%eax /* exception on byte loop */	259	jmp copy_user_handle_tail
336	/* eax: left over bytes */	260	.previous
337	6: testl %r8d,%r8d /* zero flag set? */
338	jz 7f
339	movl %eax,%ecx /* initialize x86 loop counter */
340	push %rax
341	xorl %eax,%eax
342	8: rep
343	stosb /* zero the rest */
344	11: pop %rax
345	7: ret
346	CFI_ENDPROC
347	END(copy_user_generic_c)
348		261
349	.section __ex_table,"a"	262	.section __ex_table,"a"
350	.quad 1b,3b	263	.align 8
351	.quad 2b,5b	264	.quad 1b,11b
352	.quad 8b,11b	265	.quad 3b,12b
353	.quad 10b,3b
354	.previous	266	.previous
		267	CFI_ENDPROC
		268	ENDPROC(copy_user_generic_string)