aboutsummaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorAndi Kleen <ak@suse.de>2007-02-13 07:26:19 -0500
committerAndi Kleen <andi@basil.nowhere.org>2007-02-13 07:26:19 -0500
commit0812a579c92fefa57506821fa08e90f47cb6dbdd (patch)
tree53607d77021e5b9f5adf60f2d0bfbdd551f63c8e /arch
parentee55c0be30429d7c3e61fa26c7f7e323c80e14f0 (diff)
[PATCH] x86-64: Add __copy_from_user_nocache
This does user copies in fs write() into the page cache with write combining. This pushes the destination out of the CPU's cache, but allows higher bandwidth in some cases. The theory is that the page cache data is usually not touched by the CPU again and it's better to not pollute the cache with it. Also it is a little faster. Signed-off-by: Andi Kleen <ak@suse.de>
Diffstat (limited to 'arch')
-rw-r--r--arch/x86_64/kernel/x8664_ksyms.c1
-rw-r--r--arch/x86_64/lib/Makefile2
-rw-r--r--arch/x86_64/lib/copy_user_nocache.S217
3 files changed, 219 insertions, 1 deletions
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 6d77e4797a47..23a7da312f30 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -26,6 +26,7 @@ EXPORT_SYMBOL(__put_user_4);
26EXPORT_SYMBOL(__put_user_8); 26EXPORT_SYMBOL(__put_user_8);
27 27
28EXPORT_SYMBOL(copy_user_generic); 28EXPORT_SYMBOL(copy_user_generic);
29EXPORT_SYMBOL(__copy_user_nocache);
29EXPORT_SYMBOL(copy_from_user); 30EXPORT_SYMBOL(copy_from_user);
30EXPORT_SYMBOL(copy_to_user); 31EXPORT_SYMBOL(copy_to_user);
31EXPORT_SYMBOL(__copy_from_user_inatomic); 32EXPORT_SYMBOL(__copy_from_user_inatomic);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index b78d4170fce2..8d5f835af481 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -9,4 +9,4 @@ obj-y := io.o iomap_copy.o
9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \ 9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
10 usercopy.o getuser.o putuser.o \ 10 usercopy.o getuser.o putuser.o \
11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o 11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o
12lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o 12lib-y += memcpy.o memmove.o memset.o copy_user.o rwlock.o copy_user_nocache.o
diff --git a/arch/x86_64/lib/copy_user_nocache.S b/arch/x86_64/lib/copy_user_nocache.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86_64/lib/copy_user_nocache.S
@@ -0,0 +1,217 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#include <linux/linkage.h>
8#include <asm/dwarf2.h>
9
10#define FIX_ALIGNMENT 1
11
12#include <asm/current.h>
13#include <asm/asm-offsets.h>
14#include <asm/thread_info.h>
15#include <asm/cpufeature.h>
16
17/*
18 * __copy_user_nocache - Uncached memory copy with exception handling
19 * The 64-byte and 8-byte loops write the destination with non-temporal movnti stores; loads and byte copies use ordinary moves.
20 *
21 * Input:
22 * rdi destination
23 * rsi source
24 * rdx count in bytes
25 * rcx zero flag: when nonzero, .Lzero_rest zeroes the uncopied rest of the destination on exception
26 *
27 * Output:
28 * eax uncopied bytes or 0 if successful. rbx and rcx are saved and restored; rdx, rsi, rdi and r8-r11 are modified.
29 */
30ENTRY(__copy_user_nocache)
31 CFI_STARTPROC
32 pushq %rbx /* ebx/bl are used as scratch below */
33 CFI_ADJUST_CFA_OFFSET 8
34 CFI_REL_OFFSET rbx, 0
35 pushq %rcx /* save zero flag; .Lzero_rest reads it back from (%rsp) */
36 CFI_ADJUST_CFA_OFFSET 8
37 CFI_REL_OFFSET rcx, 0
38
39 xorl %eax,%eax /* zero for the exception handler; also the result when nothing faults */
40
41#ifdef FIX_ALIGNMENT
42 /* check for bad alignment of destination */
43 movl %edi,%ecx
44 andl $7,%ecx /* ecx = destination address modulo 8 */
45 jnz .Lbad_alignment
46.Lafter_bad_alignment:
47#endif
48
49 movq %rdx,%rcx /* rcx keeps the byte count for the tail code */
50
51 movl $64,%ebx /* ebx = 64, added to rdi by the .Ls1e-.Ls8e fixup */
52 shrq $6,%rdx /* rdx = number of whole 64-byte blocks */
53 decq %rdx /* pre-decrement so the loop can branch on the sign flag */
54 js .Lhandle_tail /* no whole 64-byte block to copy */
55
56 .p2align 4
57.Lloop:
58.Ls1: movq (%rsi),%r11 /* first half of the block: load four quadwords ... */
59.Ls2: movq 1*8(%rsi),%r8
60.Ls3: movq 2*8(%rsi),%r9
61.Ls4: movq 3*8(%rsi),%r10
62.Ld1: movnti %r11,(%rdi) /* ... then write them with non-temporal stores */
63.Ld2: movnti %r8,1*8(%rdi)
64.Ld3: movnti %r9,2*8(%rdi)
65.Ld4: movnti %r10,3*8(%rdi)
66
67.Ls5: movq 4*8(%rsi),%r11 /* second half of the block, same pattern */
68.Ls6: movq 5*8(%rsi),%r8
69.Ls7: movq 6*8(%rsi),%r9
70.Ls8: movq 7*8(%rsi),%r10
71.Ld5: movnti %r11,4*8(%rdi)
72.Ld6: movnti %r8,5*8(%rdi)
73.Ld7: movnti %r9,6*8(%rdi)
74.Ld8: movnti %r10,7*8(%rdi)
75
76 dec %rdx /* sets the sign flag tested by jns below */
77
78 leaq 64(%rsi),%rsi /* lea advances the pointers without touching flags */
79 leaq 64(%rdi),%rdi
80
81 jns .Lloop /* loop while the block counter has not gone negative */
82
83 .p2align 4
84.Lhandle_tail:
85 movl %ecx,%edx /* edx = low 32 bits of the byte count saved in rcx */
86 andl $63,%ecx /* bytes left over after the 64-byte blocks */
87 shrl $3,%ecx /* ecx = number of whole quadwords left */
88 jz .Lhandle_7
89 movl $8,%ebx /* NOTE(review): no reader of this value is visible in this file */
90 .p2align 4
91.Lloop_8:
92.Ls9: movq (%rsi),%r8
93.Ld9: movnti %r8,(%rdi) /* non-temporal store, one quadword per iteration */
94 decl %ecx /* sets the zero flag tested by jnz; the leas below preserve it */
95 leaq 8(%rdi),%rdi
96 leaq 8(%rsi),%rsi
97 jnz .Lloop_8
98
99.Lhandle_7:
100 movl %edx,%ecx
101 andl $7,%ecx /* ecx = trailing byte count (edx modulo 8) */
102 jz .Lende
103 .p2align 4
104.Lloop_1:
105.Ls10: movb (%rsi),%bl /* byte copy through bl with ordinary stores */
106.Ld10: movb %bl,(%rdi)
107 incq %rdi
108 incq %rsi
109 decl %ecx
110 jnz .Lloop_1
111
112 CFI_REMEMBER_STATE
113.Lende:
114 popq %rcx /* undo the two pushes from the entry sequence */
115 CFI_ADJUST_CFA_OFFSET -8
116 CFI_RESTORE %rcx
117 popq %rbx
118 CFI_ADJUST_CFA_OFFSET -8
119 CFI_RESTORE rbx
120 ret /* result is in eax: 0 from the entry xorl, or set at .Le_zero */
121 CFI_RESTORE_STATE
122
123#ifdef FIX_ALIGNMENT
124 /* align destination */
125 .p2align 4
126.Lbad_alignment:
127 movl $8,%r9d
128 subl %ecx,%r9d /* r9d = 8 - (destination modulo 8): bytes needed to reach alignment */
129 movl %r9d,%ecx /* ecx = loop counter for .Lalign_1 */
130 cmpq %r9,%rdx /* flags from count - r9 */
131 jz .Lhandle_7 /* count equals the alignment bytes: copy it all in the byte loop */
132 js .Lhandle_7 /* count - r9 is negative: likewise */
133.Lalign_1:
134.Ls11: movb (%rsi),%bl /* copy single bytes until the destination is 8-byte aligned */
135.Ld11: movb %bl,(%rdi)
136 incq %rsi
137 incq %rdi
138 decl %ecx
139 jnz .Lalign_1
140 subq %r9,%rdx /* count is reduced only after the whole alignment loop has run */
141 jmp .Lafter_bad_alignment
142#endif
143
144 /* table sorted by exception address; each entry pairs a label of an instruction above with the fixup label to continue at */
145 .section __ex_table,"a"
146 .align 8
147 .quad .Ls1,.Ls1e
148 .quad .Ls2,.Ls2e
149 .quad .Ls3,.Ls3e
150 .quad .Ls4,.Ls4e
151 .quad .Ld1,.Ls1e /* each store .LdN shares the fixup entry of load .LsN */
152 .quad .Ld2,.Ls2e
153 .quad .Ld3,.Ls3e
154 .quad .Ld4,.Ls4e
155 .quad .Ls5,.Ls5e
156 .quad .Ls6,.Ls6e
157 .quad .Ls7,.Ls7e
158 .quad .Ls8,.Ls8e
159 .quad .Ld5,.Ls5e
160 .quad .Ld6,.Ls6e
161 .quad .Ld7,.Ls7e
162 .quad .Ld8,.Ls8e
163 .quad .Ls9,.Le_quad
164 .quad .Ld9,.Le_quad
165 .quad .Ls10,.Le_byte
166 .quad .Ld10,.Le_byte
167#ifdef FIX_ALIGNMENT
168 .quad .Ls11,.Lzero_rest /* NOTE(review): rdx is still the full count here while rdi may already have advanced - confirm the zeroing length */
169 .quad .Ld11,.Lzero_rest
170#endif
171 .quad .Le5,.Le_zero
172 .previous
173
174 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
175 pessimistic side. this is gross. it would be better to fix the
176 interface. */
177 /* eax: zero, ebx: 64 */
178.Ls1e: addl $8,%eax /* fall-through chain: entering at .LsNe leaves eax = 8 * (9 - N) */
179.Ls2e: addl $8,%eax
180.Ls3e: addl $8,%eax
181.Ls4e: addl $8,%eax
182.Ls5e: addl $8,%eax
183.Ls6e: addl $8,%eax
184.Ls7e: addl $8,%eax
185.Ls8e: addl $8,%eax
186 addq %rbx,%rdi /* +64 */
187 subq %rax,%rdi /* correct destination with computed offset */
188
189 shlq $6,%rdx /* loop counter * 64 (stride length) */
190 addq %rax,%rdx /* add offset to loopcnt */
191 andl $63,%ecx /* remaining bytes */
192 addq %rcx,%rdx /* add them */
193 jmp .Lzero_rest
194
195 /* exception on quad word loop in tail handling */
196 /* ecx: loopcnt/8, %edx: length, rdi: correct */
197.Le_quad:
198 shll $3,%ecx /* remaining quadwords converted to bytes */
199 andl $7,%edx /* trailing bytes that follow the quadword loop */
200 addl %ecx,%edx /* edx = bytes not copied */
201 /* edx: bytes to zero, rdi: dest, eax:zero */
202.Lzero_rest:
203 cmpl $0,(%rsp) /* zero flag set? */
204 jz .Le_zero /* saved flag is 0: return without zeroing */
205 movq %rdx,%rcx /* rcx = repeat count for rep stosb */
206.Le_byte: /* NOTE(review): entered from .Ls10/.Ld10 without testing the saved zero flag and without recomputing rdx - confirm intended */
207 xorl %eax,%eax /* al = 0 is the byte written by stosb */
208.Le5: rep
209 stosb
210 /* when there is another exception while zeroing the rest just return */
211.Le_zero:
212 movq %rdx,%rax /* return value = rdx */
213 jmp .Lende
214 CFI_ENDPROC
215ENDPROC(__copy_user_nocache)
216
217