author		Thomas Gleixner <tglx@linutronix.de>	2007-10-11 05:17:08 -0400
committer	Thomas Gleixner <tglx@linutronix.de>	2007-10-11 05:17:08 -0400
commit		185f3d38900f750a4566f87cde6a178f3595a115 (patch)
tree		d463f6da1af452b1bbdf476828ea88427087f255 /arch/x86/lib/copy_user_64.S
parent		51b2833060f26258ea2da091c7b9c6a358ac9dd2 (diff)
x86_64: move lib
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/lib/copy_user_64.S')
-rw-r--r--	arch/x86/lib/copy_user_64.S	354
1 files changed, 354 insertions, 0 deletions

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 000000000000..70bebd310408
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
/* Copyright 2002 Andi Kleen, SuSE Labs.
 * Subject to the GNU Public License v2.
 *
 * Functions to copy from and to user space.
 */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

#define FIX_ALIGNMENT 1

#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
#include <asm/cpufeature.h>

	.macro ALTERNATIVE_JUMP feature,orig,alt
0:
	.byte 0xe9	/* 32bit jump */
	.long \orig-1f	/* by default jump to orig */
1:
	.section .altinstr_replacement,"ax"
2:	.byte 0xe9	/* near jump with 32bit immediate */
	.long \alt-1b	/* offset */  /* or alternatively to alt */
	.previous
	.section .altinstructions,"a"
	.align 8
	.quad 0b
	.quad 2b
	.byte \feature	/* when feature is set */
	.byte 5
	.byte 5
	.previous
	.endm
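
The macro above plants a 5-byte `jmp rel32` to the default (unrolled) routine and records the site in `.altinstructions`, so the kernel's alternatives patching can retarget the jump to the string routine on CPUs that advertise the feature bit. As a rough illustration of the displacement arithmetic only (this is not the kernel's patching code, and `patch_jmp_rel32` is a made-up helper name), the rel32 field is the target address minus the end of the 5-byte instruction, which is why the macro encodes `\orig-1f` with label `1:` placed right after the immediate:

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: write a 5-byte "jmp rel32" at `site` that jumps
 * to `target`.  The displacement is relative to the first byte after
 * the instruction (site + 5). */
static void patch_jmp_rel32(uint8_t *site, const uint8_t *target)
{
	int32_t rel = (int32_t)(target - (site + 5));	/* target - end of jmp */
	site[0] = 0xe9;					/* opcode: jmp rel32 */
	memcpy(site + 1, &rel, sizeof(rel));		/* 4-byte displacement */
}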

/* Standard copy_to_user with segment limit checking */
ENTRY(copy_to_user)
	CFI_STARTPROC
	GET_THREAD_INFO(%rax)
	movq %rdi,%rcx
	addq %rdx,%rcx
	jc   bad_to_user
	cmpq threadinfo_addr_limit(%rax),%rcx
	jae  bad_to_user
	xorl %eax,%eax	/* clear zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC

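copy_to_user first verifies that the user range neither wraps around nor crosses the thread's address limit before dispatching to a copy routine. A C sketch of the same check follows; `user_range_ok` and the explicit `addr_limit` parameter are illustrative (the assembly reads the limit from the current thread_info), and the comparisons mirror the `jc`/`jae` branches above:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch of the range check performed before the copy is dispatched. */
static bool user_range_ok(uintptr_t addr, size_t len, uintptr_t addr_limit)
{
	uintptr_t end = addr + len;

	if (end < addr)			/* "jc bad_*_user": the add wrapped */
		return false;
	if (end >= addr_limit)		/* "jae bad_*_user": past the limit */
		return false;
	return true;
}
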
ENTRY(copy_user_generic)
	CFI_STARTPROC
	movl $1,%ecx	/* set zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC

ENTRY(__copy_from_user_inatomic)
	CFI_STARTPROC
	xorl %ecx,%ecx	/* clear zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC

/* Standard copy_from_user with segment limit checking */
ENTRY(copy_from_user)
	CFI_STARTPROC
	GET_THREAD_INFO(%rax)
	movq %rsi,%rcx
	addq %rdx,%rcx
	jc   bad_from_user
	cmpq threadinfo_addr_limit(%rax),%rcx
	jae  bad_from_user
	movl $1,%ecx	/* set zero flag */
	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
	CFI_ENDPROC
ENDPROC(copy_from_user)

	.section .fixup,"ax"
	/* must zero dest */
bad_from_user:
	CFI_STARTPROC
	movl %edx,%ecx
	xorl %eax,%eax
	rep
	stosb
bad_to_user:
	movl %edx,%eax
	ret
	CFI_ENDPROC
END(bad_from_user)
	.previous
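
On a rejected copy_from_user range the fixup above must not leave uninitialized kernel memory visible: bad_from_user zeroes the whole destination with `rep stosb` and then, like bad_to_user, returns the full byte count as uncopied. A minimal C sketch of that contract (the function name is made up; real faults mid-copy zero only the remaining bytes via the exception paths further down):

#include <stddef.h>
#include <string.h>

/* Sketch of the bad_from_user fixup: zero the destination and report
 * every byte as not copied. */
static size_t copy_from_user_failed_sketch(void *dst, size_t len)
{
	memset(dst, 0, len);	/* "rep stosb" with %eax == 0 */
	return len;		/* bad_to_user: %eax = byte count */
}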


/*
 * copy_user_generic_unrolled - memory copy with exception handling.
 * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
 *
 * Input:
 * rdi destination
 * rsi source
 * rdx count
 * ecx zero flag -- if true zero destination on error
 *
 * Output:
 * eax uncopied bytes or 0 if successful.
 */
ENTRY(copy_user_generic_unrolled)
	CFI_STARTPROC
	pushq %rbx
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rbx, 0
	pushq %rcx
	CFI_ADJUST_CFA_OFFSET 8
	CFI_REL_OFFSET rcx, 0
	xorl %eax,%eax	/* zero for the exception handler */

#ifdef FIX_ALIGNMENT
	/* check for bad alignment of destination */
	movl %edi,%ecx
	andl $7,%ecx
	jnz  .Lbad_alignment
.Lafter_bad_alignment:
#endif

	movq %rdx,%rcx

	movl $64,%ebx
	shrq $6,%rdx
	decq %rdx
	js   .Lhandle_tail

	.p2align 4
.Lloop:
.Ls1:	movq (%rsi),%r11
.Ls2:	movq 1*8(%rsi),%r8
.Ls3:	movq 2*8(%rsi),%r9
.Ls4:	movq 3*8(%rsi),%r10
.Ld1:	movq %r11,(%rdi)
.Ld2:	movq %r8,1*8(%rdi)
.Ld3:	movq %r9,2*8(%rdi)
.Ld4:	movq %r10,3*8(%rdi)

.Ls5:	movq 4*8(%rsi),%r11
.Ls6:	movq 5*8(%rsi),%r8
.Ls7:	movq 6*8(%rsi),%r9
.Ls8:	movq 7*8(%rsi),%r10
.Ld5:	movq %r11,4*8(%rdi)
.Ld6:	movq %r8,5*8(%rdi)
.Ld7:	movq %r9,6*8(%rdi)
.Ld8:	movq %r10,7*8(%rdi)

	decq %rdx

	leaq 64(%rsi),%rsi
	leaq 64(%rdi),%rdi

	jns  .Lloop

	.p2align 4
.Lhandle_tail:
	movl %ecx,%edx
	andl $63,%ecx
	shrl $3,%ecx
	jz   .Lhandle_7
	movl $8,%ebx
	.p2align 4
.Lloop_8:
.Ls9:	movq (%rsi),%r8
.Ld9:	movq %r8,(%rdi)
	decl %ecx
	leaq 8(%rdi),%rdi
	leaq 8(%rsi),%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx,%ecx
	andl $7,%ecx
	jz   .Lende
	.p2align 4
.Lloop_1:
.Ls10:	movb (%rsi),%bl
.Ld10:	movb %bl,(%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz  .Lloop_1

	CFI_REMEMBER_STATE
.Lende:
	popq %rcx
	CFI_ADJUST_CFA_OFFSET -8
	CFI_RESTORE rcx
	popq %rbx
	CFI_ADJUST_CFA_OFFSET -8
	CFI_RESTORE rbx
	ret
	CFI_RESTORE_STATE

#ifdef FIX_ALIGNMENT
	/* align destination */
	.p2align 4
.Lbad_alignment:
	movl $8,%r9d
	subl %ecx,%r9d
	movl %r9d,%ecx
	cmpq %r9,%rdx
	jz   .Lhandle_7
	js   .Lhandle_7
.Lalign_1:
.Ls11:	movb (%rsi),%bl
.Ld11:	movb %bl,(%rdi)
	incq %rsi
	incq %rdi
	decl %ecx
	jnz  .Lalign_1
	subq %r9,%rdx
	jmp  .Lafter_bad_alignment
#endif
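
This prologue byte-copies until %rdi reaches an 8-byte boundary so the unrolled quadword loop runs on an aligned destination; when the total count is no larger than the fixup it branches straight to the byte-tail path instead. A simplified C sketch (the name `align_dst_sketch` is invented, and unlike the assembly it simply clamps the fixup to the remaining length):

#include <stddef.h>
#include <stdint.h>

/* Copy single bytes until `dst` is 8-byte aligned (or `len` runs out)
 * and return how many bytes were consumed; the caller advances both
 * pointers and shrinks the count, as "subq %r9,%rdx" does above. */
static size_t align_dst_sketch(uint8_t *dst, const uint8_t *src, size_t len)
{
	size_t fixup = (8 - ((uintptr_t)dst & 7)) & 7;	/* bytes to the boundary */

	if (fixup > len)
		fixup = len;
	for (size_t i = 0; i < fixup; i++)		/* .Lalign_1 byte loop */
		dst[i] = src[i];
	return fixup;
}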

	/* table sorted by exception address */
	.section __ex_table,"a"
	.align 8
	.quad .Ls1,.Ls1e
	.quad .Ls2,.Ls2e
	.quad .Ls3,.Ls3e
	.quad .Ls4,.Ls4e
	.quad .Ld1,.Ls1e
	.quad .Ld2,.Ls2e
	.quad .Ld3,.Ls3e
	.quad .Ld4,.Ls4e
	.quad .Ls5,.Ls5e
	.quad .Ls6,.Ls6e
	.quad .Ls7,.Ls7e
	.quad .Ls8,.Ls8e
	.quad .Ld5,.Ls5e
	.quad .Ld6,.Ls6e
	.quad .Ld7,.Ls7e
	.quad .Ld8,.Ls8e
	.quad .Ls9,.Le_quad
	.quad .Ld9,.Le_quad
	.quad .Ls10,.Le_byte
	.quad .Ld10,.Le_byte
#ifdef FIX_ALIGNMENT
	.quad .Ls11,.Lzero_rest
	.quad .Ld11,.Lzero_rest
#endif
	.quad .Le5,.Le_zero
	.previous

	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
	   pessimistic side. this is gross. it would be better to fix the
	   interface. */
	/* eax: zero, ebx: 64 */
.Ls1e:	addl $8,%eax
.Ls2e:	addl $8,%eax
.Ls3e:	addl $8,%eax
.Ls4e:	addl $8,%eax
.Ls5e:	addl $8,%eax
.Ls6e:	addl $8,%eax
.Ls7e:	addl $8,%eax
.Ls8e:	addl $8,%eax
	addq %rbx,%rdi	/* +64 */
	subq %rax,%rdi	/* correct destination with computed offset */

	shlq $6,%rdx	/* loop counter * 64 (stride length) */
	addq %rax,%rdx	/* add offset to loopcnt */
	andl $63,%ecx	/* remaining bytes */
	addq %rcx,%rdx	/* add them */
	jmp  .Lzero_rest

	/* exception on quad word loop in tail handling */
	/* ecx: loopcnt/8, %edx: length, rdi: correct */
.Le_quad:
	shll $3,%ecx
	andl $7,%edx
	addl %ecx,%edx
	/* edx: bytes to zero, rdi: dest, eax: zero */
.Lzero_rest:
	cmpl $0,(%rsp)
	jz   .Le_zero
	movq %rdx,%rcx
.Le_byte:
	xorl %eax,%eax
.Le5:	rep
	stosb
	/* when there is another exception while zeroing the rest just return */
.Le_zero:
	movq %rdx,%rax
	jmp  .Lende
	CFI_ENDPROC
ENDPROC(copy_user_generic)
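
The unrolled routine moves 64 bytes per iteration as eight quadwords, then drains the remainder with an 8-byte loop and a byte loop; on a fault the code above works out how many bytes were left and, if the zero flag was set, clears them. A C sketch of just the copy structure (fault handling omitted; `copy_unrolled_sketch` is an illustrative name, not a kernel function):

#include <stddef.h>
#include <stdint.h>

/* 64 bytes per pass, then an 8-byte tail loop, then a byte tail loop.
 * Returns 0 to mirror the "eax = uncopied bytes or 0" convention. */
static size_t copy_unrolled_sketch(void *dst, const void *src, size_t len)
{
	uint64_t *d = dst;
	const uint64_t *s = src;

	for (size_t blocks = len >> 6; blocks; blocks--) {	/* .Lloop */
		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
		d += 8; s += 8;
	}
	for (size_t q = (len & 63) >> 3; q; q--)		/* .Lloop_8 */
		*d++ = *s++;

	uint8_t *db = (uint8_t *)d;
	const uint8_t *sb = (const uint8_t *)s;
	for (size_t b = len & 7; b; b--)			/* .Lloop_1 */
		*db++ = *sb++;

	return 0;
}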


/* Some CPUs run faster using the string copy instructions.
   This is also a lot simpler. Use them when possible.
   Patch in jmps to this code instead of copying it fully
   to avoid unwanted aliasing in the exception tables. */

/* rdi	destination
 * rsi	source
 * rdx	count
 * ecx	zero flag
 *
 * Output:
 * eax	uncopied bytes or 0 if successful.
 *
 * Only 4GB of copy is supported. This shouldn't be a problem
 * because the kernel normally only writes from/to page sized chunks
 * even if user space passed a longer buffer.
 * And more would be dangerous because both Intel and AMD have
 * errata with rep movsq > 4GB. If someone feels the need to fix
 * this please consider this.
 */
ENTRY(copy_user_generic_string)
	CFI_STARTPROC
	movl %ecx,%r8d		/* save zero flag */
	movl %edx,%ecx
	shrl $3,%ecx
	andl $7,%edx
	jz   10f
1:	rep
	movsq
	movl %edx,%ecx
2:	rep
	movsb
9:	movl %ecx,%eax
	ret

	/* multiple of 8 byte */
10:	rep
	movsq
	xor  %eax,%eax
	ret

	/* exception handling */
3:	lea (%rdx,%rcx,8),%rax	/* exception on quad loop */
	jmp 6f
5:	movl %ecx,%eax		/* exception on byte loop */
	/* eax: left over bytes */
6:	testl %r8d,%r8d		/* zero flag set? */
	jz 7f
	movl %eax,%ecx		/* initialize x86 loop counter */
	push %rax
	xorl %eax,%eax
8:	rep
	stosb			/* zero the rest */
11:	pop %rax
7:	ret
	CFI_ENDPROC
END(copy_user_generic_c)
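
The string variant does the bulk of the copy with `rep movsq` and the sub-quadword remainder with `rep movsb`, using 32-bit counts, which is why the comment above limits it to 4GB. A C sketch of the split (fault handling and the zero-on-error path omitted; `copy_string_sketch` is an illustrative name):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Whole quadwords first ("rep movsq"), then the 0-7 byte tail
 * ("rep movsb").  Returns 0: nothing left over. */
static size_t copy_string_sketch(void *dst, const void *src, size_t len)
{
	size_t qwords = len >> 3;	/* shrl $3,%ecx */
	size_t tail   = len & 7;	/* andl $7,%edx */

	memcpy(dst, src, qwords * 8);
	memcpy((uint8_t *)dst + qwords * 8,
	       (const uint8_t *)src + qwords * 8, tail);
	return 0;
}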

	.section __ex_table,"a"
	.quad 1b,3b
	.quad 2b,5b
	.quad 8b,11b
	.quad 10b,3b
	.previous
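
Both `__ex_table` sections consist of address pairs: the instruction that may fault on a user access and the fixup code to resume at. A hedged C sketch of the entry layout and a linear lookup (`struct ex_entry` and `find_fixup` are illustrative names; the kernel keeps the table sorted and searches it more efficiently):

#include <stddef.h>
#include <stdint.h>

/* One exception-table entry: faulting instruction and its fixup. */
struct ex_entry {
	uint64_t insn;		/* e.g. the address behind .Ls1 */
	uint64_t fixup;		/* e.g. the address behind .Ls1e */
};

/* Given the faulting instruction pointer, find where to resume. */
static const struct ex_entry *find_fixup(const struct ex_entry *tbl,
					 size_t n, uint64_t fault_ip)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].insn == fault_ip)
			return &tbl[i];
	return NULL;		/* no fixup: a genuine kernel fault */
}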