[PATCH] x86-64: Add __copy_from_user_nocache

This does user copies in fs write() into the page cache with write combining. This pushes the destination out of the CPU's cache, but allows higher bandwidth in some cases. The theory is that the page cache data is usually not touched by the CPU again and it's better not to pollute the cache with it. Also it is a little faster. Signed-off-by: Andi Kleen <ak@suse.de>
author: Andi Kleen <ak@suse.de> 2007-02-13 07:26:19 -0500
committer: Andi Kleen <andi@basil.nowhere.org> 2007-02-13 07:26:19 -0500
commit: 0812a579c92fefa57506821fa08e90f47cb6dbdd (patch)
tree: 53607d77021e5b9f5adf60f2d0bfbdd551f63c8e /arch/x86_64
parent: ee55c0be30429d7c3e61fa26c7f7e323c80e14f0 (diff)
3 files changed, 219 insertions, 1 deletions
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e4797a4..23a7da312f3 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
 EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(copy_user_generic);
+EXPORT_SYMBOL(__copy_user_nocache);
 EXPORT_SYMBOL(copy_from_user);
 EXPORT_SYMBOL(copy_to_user);
 EXPORT_SYMBOL(__copy_from_user_inatomic);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index b78d4170fce..8d5f835af48 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
 lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
        usercopy.o getuser.o putuser.o  \
        thunk.o clear_page.o copy_page.o bitstr.o bitops.o
-lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o
+lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S
new file mode 100644
index 00000000000..4620efb12f1
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#define FIX_ALIGNMENT 1
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+/*
+ * __copy_user_nocache - Memory copy using non-temporal stores, with exception handling
+ * The destination is written with movnti (non-temporal hint) to limit cache pollution; the source is read with ordinary loads.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count in bytes
+ * rcx zero flag: when non-zero, zero the rest of the destination on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+        CFI_STARTPROC
+        pushq %rbx              /* rbx is used as scratch below and restored at .Lende */
+        CFI_ADJUST_CFA_OFFSET 8
+        CFI_REL_OFFSET rbx, 0
+        pushq %rcx              /* save zero flag; it is read back from (%rsp) at .Lzero_rest */
+        CFI_ADJUST_CFA_OFFSET 8
+        CFI_REL_OFFSET rcx, 0
+        xorl %eax,%eax          /* zero for the exception handler; stays 0 as the return value when nothing faults */
+#ifdef FIX_ALIGNMENT
+        /* check for bad alignment of destination */
+        movl %edi,%ecx
+        andl $7,%ecx            /* ecx = low three bits of the destination address */
+        jnz  .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+        movq %rdx,%rcx          /* rcx = byte count, kept for the tail code and the fixups */
+        movl $64,%ebx           /* ebx = 64, the block size used by the main-loop fixup */
+        shrq $6,%rdx            /* rdx = number of whole 64-byte blocks */
+        decq %rdx
+        js   .Lhandle_tail      /* no whole 64-byte block to copy */
+        .p2align 4
+.Lloop:
+.Ls1:   movq (%rsi),%r11        /* first half of the block: four quadword loads ... */
+.Ls2:   movq 1*8(%rsi),%r8
+.Ls3:   movq 2*8(%rsi),%r9
+.Ls4:   movq 3*8(%rsi),%r10
+.Ld1:   movnti %r11,(%rdi)      /* ... followed by four non-temporal quadword stores */
+.Ld2:   movnti %r8,1*8(%rdi)
+.Ld3:   movnti %r9,2*8(%rdi)
+.Ld4:   movnti %r10,3*8(%rdi)
+.Ls5:   movq 4*8(%rsi),%r11     /* second half of the block, same pattern */
+.Ls6:   movq 5*8(%rsi),%r8
+.Ls7:   movq 6*8(%rsi),%r9
+.Ls8:   movq 7*8(%rsi),%r10
+.Ld5:   movnti %r11,4*8(%rdi)
+.Ld6:   movnti %r8,5*8(%rdi)
+.Ld7:   movnti %r9,6*8(%rdi)
+.Ld8:   movnti %r10,7*8(%rdi)
+        dec  %rdx
+        leaq 64(%rsi),%rsi      /* lea does not modify flags, so the result of dec reaches jns */
+        leaq 64(%rdi),%rdi
+        jns  .Lloop             /* repeat while blocks remain */
+        .p2align 4
+.Lhandle_tail:
+        movl %ecx,%edx          /* edx = low 32 bits of the byte count saved in rcx */
+        andl $63,%ecx
+        shrl $3,%ecx            /* ecx = whole quadwords left after the 64-byte blocks */
+        jz   .Lhandle_7
+        movl $8,%ebx            /* NOTE(review): ebx = 8 does not appear to be read by the .Le_quad fixup - confirm */
+        .p2align 4
+.Lloop_8:
+.Ls9:   movq (%rsi),%r8
+.Ld9:   movnti %r8,(%rdi)
+        decl %ecx
+        leaq 8(%rdi),%rdi       /* flags from decl survive both lea instructions */
+        leaq 8(%rsi),%rsi
+        jnz .Lloop_8
+.Lhandle_7:
+        movl %edx,%ecx
+        andl $7,%ecx            /* ecx = trailing bytes (length mod 8) */
+        jz   .Lende
+        .p2align 4
+.Lloop_1:
+.Ls10:  movb (%rsi),%bl         /* byte-at-a-time copy through bl */
+.Ld10:  movb %bl,(%rdi)
+        incq %rdi
+        incq %rsi
+        decl %ecx
+        jnz .Lloop_1
+        CFI_REMEMBER_STATE
+.Lende:
+        popq %rcx               /* pop the saved zero flag back into rcx */
+        CFI_ADJUST_CFA_OFFSET -8
+        CFI_RESTORE %rcx
+        popq %rbx
+        CFI_ADJUST_CFA_OFFSET -8
+        CFI_RESTORE rbx
+        ret                     /* eax is 0 on the normal path, or the value set at .Le_zero after a fault */
+        CFI_RESTORE_STATE
+#ifdef FIX_ALIGNMENT
+        /* align destination */
+        .p2align 4
+.Lbad_alignment:
+        movl $8,%r9d
+        subl %ecx,%r9d          /* r9d = bytes needed to reach 8-byte alignment of the destination */
+        movl %r9d,%ecx
+        cmpq %r9,%rdx
+        jz   .Lhandle_7         /* count == r9: the byte loop at .Lhandle_7 copies everything */
+        js   .Lhandle_7         /* count - r9 is negative: likewise, only the byte loop is needed */
+.Lalign_1:
+.Ls11:  movb (%rsi),%bl
+.Ld11:  movb %bl,(%rdi)
+        incq %rsi
+        incq %rdi
+        decl %ecx
+        jnz .Lalign_1
+        subq %r9,%rdx           /* remove the bytes copied while aligning from the count */
+        jmp .Lafter_bad_alignment
+#endif
+        /* table sorted by exception address */
+        .section __ex_table,"a"
+        .align 8
+        .quad .Ls1,.Ls1e        /* pairs: instruction that may fault, label to continue at */
+        .quad .Ls2,.Ls2e
+        .quad .Ls3,.Ls3e
+        .quad .Ls4,.Ls4e
+        .quad .Ld1,.Ls1e        /* stores share the fixup of the load with the same index */
+        .quad .Ld2,.Ls2e
+        .quad .Ld3,.Ls3e
+        .quad .Ld4,.Ls4e
+        .quad .Ls5,.Ls5e
+        .quad .Ls6,.Ls6e
+        .quad .Ls7,.Ls7e
+        .quad .Ls8,.Ls8e
+        .quad .Ld5,.Ls5e
+        .quad .Ld6,.Ls6e
+        .quad .Ld7,.Ls7e
+        .quad .Ld8,.Ls8e
+        .quad .Ls9,.Le_quad
+        .quad .Ld9,.Le_quad
+        .quad .Ls10,.Le_byte
+        .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+        .quad .Ls11,.Lzero_rest /* NOTE(review): rdx is still the full count here while rdi may already have advanced - confirm */
+        .quad .Ld11,.Lzero_rest
+#endif
+        .quad .Le5,.Le_zero
+        .previous
+        /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+           pessimistic side. this is gross. it would be better to fix the
+           interface. */
+        /* eax: zero, ebx: 64 */
+.Ls1e:  addl $8,%eax            /* labels fall through: entering at .Ls1e gives eax = 64, at .Ls8e gives eax = 8 */
+.Ls2e:  addl $8,%eax
+.Ls3e:  addl $8,%eax
+.Ls4e:  addl $8,%eax
+.Ls5e:  addl $8,%eax
+.Ls6e:  addl $8,%eax
+.Ls7e:  addl $8,%eax
+.Ls8e:  addl $8,%eax
+        addq %rbx,%rdi  /* +64 */
+        subq %rax,%rdi  /* correct destination with computed offset */
+        shlq $6,%rdx    /* loop counter * 64 (stride length) */
+        addq %rax,%rdx  /* add offset to loopcnt */
+        andl $63,%ecx   /* remaining bytes */
+        addq %rcx,%rdx  /* add them */
+        jmp .Lzero_rest
+        /* exception on quad word loop in tail handling */
+        /* ecx: remaining quadwords, %edx: length, rdi: correct */
+.Le_quad:
+        shll $3,%ecx            /* remaining quadwords -> bytes */
+        andl $7,%edx            /* trailing bytes that follow the quadword loop */
+        addl %ecx,%edx
+        /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+        cmpl $0,(%rsp)  /* zero flag set? */
+        jz   .Le_zero           /* flag clear: skip the zeroing and just return rdx */
+        movq %rdx,%rcx
+.Le_byte:                       /* also entered directly from the .Ls10/.Ld10 fixups, with ecx = bytes left in the byte loop */
+        xorl %eax,%eax          /* al = 0 is the byte stored by stosb; NOTE(review): the direct .Le_byte entry does not test the zero flag - confirm */
+.Le5:   rep
+        stosb
+        /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+        movq %rdx,%rax          /* return rdx; NOTE(review): via .Le_byte, edx still holds the length set before the byte loop, not the bytes left - confirm */
+        jmp .Lende
+        CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
author	Andi Kleen <ak@suse.de>	2007-02-13 07:26:19 -0500
committer	Andi Kleen <andi@basil.nowhere.org>	2007-02-13 07:26:19 -0500
commit	0812a579c92fefa57506821fa08e90f47cb6dbdd (patch)
tree	53607d77021e5b9f5adf60f2d0bfbdd551f63c8e /arch/x86_64
parent	ee55c0be30429d7c3e61fa26c7f7e323c80e14f0 (diff)

diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 6d77e4797a4..23a7da312f3 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
26	EXPORT_SYMBOL(__put_user_8);	26	EXPORT_SYMBOL(__put_user_8);
27		27
28	EXPORT_SYMBOL(copy_user_generic);	28	EXPORT_SYMBOL(copy_user_generic);
		29	EXPORT_SYMBOL(__copy_user_nocache);
29	EXPORT_SYMBOL(copy_from_user);	30	EXPORT_SYMBOL(copy_from_user);
30	EXPORT_SYMBOL(copy_to_user);	31	EXPORT_SYMBOL(copy_to_user);
31	EXPORT_SYMBOL(__copy_from_user_inatomic);	32	EXPORT_SYMBOL(__copy_from_user_inatomic);


diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index b78d4170fce..8d5f835af48 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
9	lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \	9	lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
10	usercopy.o getuser.o putuser.o \	10	usercopy.o getuser.o putuser.o \
11	thunk.o clear_page.o copy_page.o bitstr.o bitops.o	11	thunk.o clear_page.o copy_page.o bitstr.o bitops.o
12	lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o	12	lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o


diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S new file mode 100644 index 00000000000..4620efb12f1 --- /dev/null +++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
		1	/* Copyright 2002 Andi Kleen, SuSE Labs.
		2	* Subject to the GNU Public License v2.
		3	*
		4	* Functions to copy from and to user space.
		5	*/
		6
		7	#include <linux/linkage.h>
		8	#include <asm/dwarf2.h>
		9
		10	#define FIX_ALIGNMENT 1
		11
		12	#include <asm/current.h>
		13	#include <asm/asm-offsets.h>
		14	#include <asm/thread_info.h>
		15	#include <asm/cpufeature.h>
		16
		17	/*
		18	* copy_user_nocache - Uncached memory copy with exception handling
		19	* This will force destination/source out of cache for more performance.
		20	*
		21	* Input:
		22	* rdi destination
		23	* rsi source
		24	* rdx count
		25	* rcx zero flag when 1 zero on exception
		26	*
		27	* Output:
		28	* eax uncopied bytes or 0 if successful.
		29	*/
		30	ENTRY(__copy_user_nocache)
		31	CFI_STARTPROC
		32	pushq %rbx
		33	CFI_ADJUST_CFA_OFFSET 8
		34	CFI_REL_OFFSET rbx, 0
		35	pushq %rcx /* save zero flag */
		36	CFI_ADJUST_CFA_OFFSET 8
		37	CFI_REL_OFFSET rcx, 0
		38
		39	xorl %eax,%eax /* zero for the exception handler */
		40
		41	#ifdef FIX_ALIGNMENT
		42	/* check for bad alignment of destination */
		43	movl %edi,%ecx
		44	andl $7,%ecx
		45	jnz .Lbad_alignment
		46	.Lafter_bad_alignment:
		47	#endif
		48
		49	movq %rdx,%rcx
		50
		51	movl $64,%ebx
		52	shrq $6,%rdx
		53	decq %rdx
		54	js .Lhandle_tail
		55
		56	.p2align 4
		57	.Lloop:
		58	.Ls1: movq (%rsi),%r11
		59	.Ls2: movq 1*8(%rsi),%r8
		60	.Ls3: movq 2*8(%rsi),%r9
		61	.Ls4: movq 3*8(%rsi),%r10
		62	.Ld1: movnti %r11,(%rdi)
		63	.Ld2: movnti %r8,1*8(%rdi)
		64	.Ld3: movnti %r9,2*8(%rdi)
		65	.Ld4: movnti %r10,3*8(%rdi)
		66
		67	.Ls5: movq 4*8(%rsi),%r11
		68	.Ls6: movq 5*8(%rsi),%r8
		69	.Ls7: movq 6*8(%rsi),%r9
		70	.Ls8: movq 7*8(%rsi),%r10
		71	.Ld5: movnti %r11,4*8(%rdi)
		72	.Ld6: movnti %r8,5*8(%rdi)
		73	.Ld7: movnti %r9,6*8(%rdi)
		74	.Ld8: movnti %r10,7*8(%rdi)
		75
		76	dec %rdx
		77
		78	leaq 64(%rsi),%rsi
		79	leaq 64(%rdi),%rdi
		80
		81	jns .Lloop
		82
		83	.p2align 4
		84	.Lhandle_tail:
		85	movl %ecx,%edx
		86	andl $63,%ecx
		87	shrl $3,%ecx
		88	jz .Lhandle_7
		89	movl $8,%ebx
		90	.p2align 4
		91	.Lloop_8:
		92	.Ls9: movq (%rsi),%r8
		93	.Ld9: movnti %r8,(%rdi)
		94	decl %ecx
		95	leaq 8(%rdi),%rdi
		96	leaq 8(%rsi),%rsi
		97	jnz .Lloop_8
		98
		99	.Lhandle_7:
		100	movl %edx,%ecx
		101	andl $7,%ecx
		102	jz .Lende
		103	.p2align 4
		104	.Lloop_1:
		105	.Ls10: movb (%rsi),%bl
		106	.Ld10: movb %bl,(%rdi)
		107	incq %rdi
		108	incq %rsi
		109	decl %ecx
		110	jnz .Lloop_1
		111
		112	CFI_REMEMBER_STATE
		113	.Lende:
		114	popq %rcx
		115	CFI_ADJUST_CFA_OFFSET -8
		116	CFI_RESTORE %rcx
		117	popq %rbx
		118	CFI_ADJUST_CFA_OFFSET -8
		119	CFI_RESTORE rbx
		120	ret
		121	CFI_RESTORE_STATE
		122
		123	#ifdef FIX_ALIGNMENT
		124	/* align destination */
		125	.p2align 4
		126	.Lbad_alignment:
		127	movl $8,%r9d
		128	subl %ecx,%r9d
		129	movl %r9d,%ecx
		130	cmpq %r9,%rdx
		131	jz .Lhandle_7
		132	js .Lhandle_7
		133	.Lalign_1:
		134	.Ls11: movb (%rsi),%bl
		135	.Ld11: movb %bl,(%rdi)
		136	incq %rsi
		137	incq %rdi
		138	decl %ecx
		139	jnz .Lalign_1
		140	subq %r9,%rdx
		141	jmp .Lafter_bad_alignment
		142	#endif
		143
		144	/* table sorted by exception address */
		145	.section __ex_table,"a"
		146	.align 8
		147	.quad .Ls1,.Ls1e
		148	.quad .Ls2,.Ls2e
		149	.quad .Ls3,.Ls3e
		150	.quad .Ls4,.Ls4e
		151	.quad .Ld1,.Ls1e
		152	.quad .Ld2,.Ls2e
		153	.quad .Ld3,.Ls3e
		154	.quad .Ld4,.Ls4e
		155	.quad .Ls5,.Ls5e
		156	.quad .Ls6,.Ls6e
		157	.quad .Ls7,.Ls7e
		158	.quad .Ls8,.Ls8e
		159	.quad .Ld5,.Ls5e
		160	.quad .Ld6,.Ls6e
		161	.quad .Ld7,.Ls7e
		162	.quad .Ld8,.Ls8e
		163	.quad .Ls9,.Le_quad
		164	.quad .Ld9,.Le_quad
		165	.quad .Ls10,.Le_byte
		166	.quad .Ld10,.Le_byte
		167	#ifdef FIX_ALIGNMENT
		168	.quad .Ls11,.Lzero_rest
		169	.quad .Ld11,.Lzero_rest
		170	#endif
		171	.quad .Le5,.Le_zero
		172	.previous
		173
		174	/* compute 64-offset for main loop. 8 bytes accuracy with error on the
		175	pessimistic side. this is gross. it would be better to fix the
		176	interface. */
		177	/* eax: zero, ebx: 64 */
		178	.Ls1e: addl $8,%eax
		179	.Ls2e: addl $8,%eax
		180	.Ls3e: addl $8,%eax
		181	.Ls4e: addl $8,%eax
		182	.Ls5e: addl $8,%eax
		183	.Ls6e: addl $8,%eax
		184	.Ls7e: addl $8,%eax
		185	.Ls8e: addl $8,%eax
		186	addq %rbx,%rdi /* +64 */
		187	subq %rax,%rdi /* correct destination with computed offset */
		188
		189	shlq $6,%rdx /* loop counter * 64 (stride length) */
		190	addq %rax,%rdx /* add offset to loopcnt */
		191	andl $63,%ecx /* remaining bytes */
		192	addq %rcx,%rdx /* add them */
		193	jmp .Lzero_rest
		194
		195	/* exception on quad word loop in tail handling */
		196	/* ecx: loopcnt/8, %edx: length, rdi: correct */
		197	.Le_quad:
		198	shll $3,%ecx
		199	andl $7,%edx
		200	addl %ecx,%edx
		201	/* edx: bytes to zero, rdi: dest, eax:zero */
		202	.Lzero_rest:
		203	cmpl $0,(%rsp) /* zero flag set? */
		204	jz .Le_zero
		205	movq %rdx,%rcx
		206	.Le_byte:
		207	xorl %eax,%eax
		208	.Le5: rep
		209	stosb
		210	/* when there is another exception while zeroing the rest just return */
		211	.Le_zero:
		212	movq %rdx,%rax
		213	jmp .Lende
		214	CFI_ENDPROC
		215	ENDPROC(__copy_user_nocache)
		216
		217