Diffstat (limited to 'arch/x86_64/lib')
-rw-r--r--  arch/x86_64/lib/Makefile        |   14
-rw-r--r--  arch/x86_64/lib/bitops.c        |  141
-rw-r--r--  arch/x86_64/lib/bitstr.c        |   28
-rw-r--r--  arch/x86_64/lib/clear_page.S    |   50
-rw-r--r--  arch/x86_64/lib/copy_page.S     |  101
-rw-r--r--  arch/x86_64/lib/copy_user.S     |  294
-rw-r--r--  arch/x86_64/lib/csum-copy.S     |  233
-rw-r--r--  arch/x86_64/lib/csum-partial.c  |  150
-rw-r--r--  arch/x86_64/lib/csum-wrappers.c |  129
-rw-r--r--  arch/x86_64/lib/dec_and_lock.c  |   40
-rw-r--r--  arch/x86_64/lib/delay.c         |   48
-rw-r--r--  arch/x86_64/lib/getuser.S       |  101
-rw-r--r--  arch/x86_64/lib/io.c            |   23
-rw-r--r--  arch/x86_64/lib/memcpy.S        |  121
-rw-r--r--  arch/x86_64/lib/memmove.c       |   19
-rw-r--r--  arch/x86_64/lib/memset.S        |  125
-rw-r--r--  arch/x86_64/lib/putuser.S       |   89
-rw-r--r--  arch/x86_64/lib/thunk.S         |   95
-rw-r--r--  arch/x86_64/lib/usercopy.c      |  153
19 files changed, 1954 insertions(+), 0 deletions(-)
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
new file mode 100644
index 000000000000..6b26a1c1e9ff
--- /dev/null
+++ b/arch/x86_64/lib/Makefile
@@ -0,0 +1,14 @@
1#
2# Makefile for x86_64-specific library files.
3#
4
5CFLAGS_csum-partial.o := -funroll-loops
6
7obj-y := io.o
8
9lib-y := csum-partial.o csum-copy.o csum-wrappers.o delay.o \
10 usercopy.o getuser.o putuser.o \
11 thunk.o clear_page.o copy_page.o bitstr.o bitops.o
12lib-y += memcpy.o memmove.o memset.o copy_user.o
13
14lib-$(CONFIG_HAVE_DEC_LOCK) += dec_and_lock.o
diff --git a/arch/x86_64/lib/bitops.c b/arch/x86_64/lib/bitops.c
new file mode 100644
index 000000000000..a29fb75b33ac
--- /dev/null
+++ b/arch/x86_64/lib/bitops.c
@@ -0,0 +1,141 @@
1#include <linux/bitops.h>
2
3#undef find_first_zero_bit
4#undef find_next_zero_bit
5#undef find_first_bit
6#undef find_next_bit
7
8/**
9 * find_first_zero_bit - find the first zero bit in a memory region
10 * @addr: The address to start the search at
11 * @size: The maximum size to search
12 *
13 * Returns the bit-number of the first zero bit, not the number of the byte
14 * containing a bit.
15 */
16inline long find_first_zero_bit(const unsigned long * addr, unsigned long size)
17{
18 long d0, d1, d2;
19 long res;
20
21 if (!size)
22 return 0;
23 asm volatile(
24 " repe; scasq\n"
25 " je 1f\n"
26 " xorq -8(%%rdi),%%rax\n"
27 " subq $8,%%rdi\n"
28 " bsfq %%rax,%%rdx\n"
29 "1: subq %[addr],%%rdi\n"
30 " shlq $3,%%rdi\n"
31 " addq %%rdi,%%rdx"
32 :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
33 :"0" (0ULL), "1" ((size + 63) >> 6), "2" (addr), "3" (-1ULL),
34 [addr] "r" (addr) : "memory");
35 return res;
36}
37
38/**
39 * find_next_zero_bit - find the next zero bit in a memory region
40 * @addr: The address to base the search on
41 * @offset: The bitnumber to start searching at
42 * @size: The maximum size to search
43 */
44long find_next_zero_bit (const unsigned long * addr, long size, long offset)
45{
46 unsigned long * p = ((unsigned long *) addr) + (offset >> 6);
47 unsigned long set = 0;
48 unsigned long res, bit = offset&63;
49
50 if (bit) {
51 /*
52 * Look for zero in first word
53 */
54 asm("bsfq %1,%0\n\t"
55 "cmoveq %2,%0"
56 : "=r" (set)
57 : "r" (~(*p >> bit)), "r"(64L));
58 if (set < (64 - bit))
59 return set + offset;
60 set = 64 - bit;
61 p++;
62 }
63 /*
64 * No zero yet, search remaining full words for a zero
65 */
66 res = find_first_zero_bit ((const unsigned long *)p,
67 size - 64 * (p - (unsigned long *) addr));
68 return (offset + set + res);
69}
70
71static inline long
72__find_first_bit(const unsigned long * addr, unsigned long size)
73{
74 long d0, d1;
75 long res;
76
77 asm volatile(
78 " repe; scasq\n"
79 " jz 1f\n"
80 " subq $8,%%rdi\n"
81 " bsfq (%%rdi),%%rax\n"
82 "1: subq %[addr],%%rdi\n"
83 " shlq $3,%%rdi\n"
84 " addq %%rdi,%%rax"
85 :"=a" (res), "=&c" (d0), "=&D" (d1)
86 :"0" (0ULL),
87 "1" ((size + 63) >> 6), "2" (addr),
88 [addr] "r" (addr) : "memory");
89 return res;
90}
91
92/**
93 * find_first_bit - find the first set bit in a memory region
94 * @addr: The address to start the search at
95 * @size: The maximum size to search
96 *
97 * Returns the bit-number of the first set bit, not the number of the byte
98 * containing a bit.
99 */
100long find_first_bit(const unsigned long * addr, unsigned long size)
101{
102 return __find_first_bit(addr,size);
103}
104
105/**
106 * find_next_bit - find the next set bit in a memory region
107 * @addr: The address to base the search on
108 * @offset: The bitnumber to start searching at
109 * @size: The maximum size to search
110 */
111long find_next_bit(const unsigned long * addr, long size, long offset)
112{
113 const unsigned long * p = addr + (offset >> 6);
114 unsigned long set = 0, bit = offset & 63, res;
115
116 if (bit) {
117 /*
118 * Look for nonzero in the first 64 bits:
119 */
120 asm("bsfq %1,%0\n\t"
121 "cmoveq %2,%0\n\t"
122 : "=r" (set)
123 : "r" (*p >> bit), "r" (64L));
124 if (set < (64 - bit))
125 return set + offset;
126 set = 64 - bit;
127 p++;
128 }
129 /*
130 * No set bit yet, search remaining full words for a bit
131 */
132 res = __find_first_bit (p, size - 64 * (p - addr));
133 return (offset + set + res);
134}
135
136#include <linux/module.h>
137
138EXPORT_SYMBOL(find_next_bit);
139EXPORT_SYMBOL(find_first_bit);
140EXPORT_SYMBOL(find_first_zero_bit);
141EXPORT_SYMBOL(find_next_zero_bit);
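The find_first_zero_bit() assembly above scans whole 64-bit words with repe scasq against -1 and then applies bsfq to the complemented mismatching word; the size is rounded up to a multiple of 64 bits. A minimal portable C sketch of the same semantics, for illustration only (userspace, relies on the GCC builtin __builtin_ctzl; it is not part of this patch):

    /* Illustrative sketch of find_first_zero_bit(), word at a time. */
    static long ref_find_first_zero_bit(const unsigned long *addr, unsigned long size)
    {
            unsigned long words = (size + 63) >> 6; /* same rounding as the asm */
            unsigned long i;

            for (i = 0; i < words; i++)
                    if (~addr[i])   /* this word has at least one zero bit */
                            return i * 64 + __builtin_ctzl(~addr[i]);
            return words * 64;      /* no zero bit found in the words scanned */
    }

As in the assembly, the return value can exceed size when no zero bit exists within the scanned words.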
diff --git a/arch/x86_64/lib/bitstr.c b/arch/x86_64/lib/bitstr.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86_64/lib/bitstr.c
@@ -0,0 +1,28 @@
1#include <linux/module.h>
2#include <linux/bitops.h>
3
4/* Find string of zero bits in a bitmap */
5unsigned long
6find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
7{
8 unsigned long n, end, i;
9
10 again:
11 n = find_next_zero_bit(bitmap, nbits, start);
12 if (n == -1)
13 return -1;
14
15 /* could test bitsliced, but it's hardly worth it */
16 end = n+len;
17 if (end >= nbits)
18 return -1;
19 for (i = n+1; i < end; i++) {
20 if (test_bit(i, bitmap)) {
21 start = i+1;
22 goto again;
23 }
24 }
25 return n;
26}
27
28EXPORT_SYMBOL(find_next_zero_string);
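find_next_zero_string() only locates a run of len clear bits; it does not claim them, so callers are expected to set the bits themselves, typically under a lock. A hypothetical allocator built on top of it might look like the sketch below (alloc_range, map_lock and the locking scheme are assumptions for illustration, not part of this patch):

    /* Hypothetical: allocate `len` contiguous slots from a bitmap. */
    #include <linux/bitops.h>
    #include <linux/spinlock.h>

    static spinlock_t map_lock = SPIN_LOCK_UNLOCKED;        /* assumed lock */

    static long alloc_range(unsigned long *bitmap, long nbits, int len)
    {
            long start, i;

            spin_lock(&map_lock);
            start = find_next_zero_string(bitmap, 0, nbits, len);
            if (start >= 0)
                    for (i = start; i < start + len; i++)
                            __set_bit(i, bitmap);   /* claim the run */
            spin_unlock(&map_lock);
            return start;   /* -1 when no run of len zero bits was found */
    }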
diff --git a/arch/x86_64/lib/clear_page.S b/arch/x86_64/lib/clear_page.S
new file mode 100644
index 000000000000..30a9da458c15
--- /dev/null
+++ b/arch/x86_64/lib/clear_page.S
@@ -0,0 +1,50 @@
1/*
2 * Zero a page.
3 * rdi page
4 */
5 .globl clear_page
6 .p2align 4
7clear_page:
8 xorl %eax,%eax
9 movl $4096/64,%ecx
10 .p2align 4
11.Lloop:
12 decl %ecx
13#define PUT(x) movq %rax,x*8(%rdi)
14 movq %rax,(%rdi)
15 PUT(1)
16 PUT(2)
17 PUT(3)
18 PUT(4)
19 PUT(5)
20 PUT(6)
21 PUT(7)
22 leaq 64(%rdi),%rdi
23 jnz .Lloop
24 nop
25 ret
26clear_page_end:
27
28 /* C stepping K8 runs faster using the string instructions.
29 It is also a lot simpler. Use this when possible */
30
31#include <asm/cpufeature.h>
32
33 .section .altinstructions,"a"
34 .align 8
35 .quad clear_page
36 .quad clear_page_c
37 .byte X86_FEATURE_K8_C
38 .byte clear_page_end-clear_page
39 .byte clear_page_c_end-clear_page_c
40 .previous
41
42 .section .altinstr_replacement,"ax"
43clear_page_c:
44 movl $4096/8,%ecx
45 xorl %eax,%eax
46 rep
47 stosq
48 ret
49clear_page_c_end:
50 .previous
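Each record in the .altinstructions section above describes one patch site: the original code address, the replacement address, the CPU feature bit that enables the replacement, and the two code lengths. At boot the kernel copies the replacement over the original when the feature bit is set. A C sketch of that record layout, with illustrative field names (the kernel's own struct alt_instr definition elsewhere in the tree is the authoritative one):

    /* Sketch of one .altinstructions entry as emitted above. */
    struct alt_record_sketch {
            unsigned long orig;             /* .quad clear_page                    */
            unsigned long replacement;      /* .quad clear_page_c                  */
            unsigned char cpuid;            /* .byte X86_FEATURE_K8_C              */
            unsigned char origlen;          /* .byte clear_page_end-clear_page     */
            unsigned char replacementlen;   /* .byte clear_page_c_end-clear_page_c */
    };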
diff --git a/arch/x86_64/lib/copy_page.S b/arch/x86_64/lib/copy_page.S
new file mode 100644
index 000000000000..dd3aa47b6bf5
--- /dev/null
+++ b/arch/x86_64/lib/copy_page.S
@@ -0,0 +1,101 @@
1/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
2
3/* Don't use streaming stores; regular stores are better when the target
4   ends up in the cache. */
5
6/* Could vary the prefetch distance based on SMP/UP */
7
8 .globl copy_page
9 .p2align 4
10copy_page:
11 subq $3*8,%rsp
12 movq %rbx,(%rsp)
13 movq %r12,1*8(%rsp)
14 movq %r13,2*8(%rsp)
15
16 movl $(4096/64)-5,%ecx
17 .p2align 4
18.Loop64:
19 dec %rcx
20
21 movq (%rsi), %rax
22 movq 8 (%rsi), %rbx
23 movq 16 (%rsi), %rdx
24 movq 24 (%rsi), %r8
25 movq 32 (%rsi), %r9
26 movq 40 (%rsi), %r10
27 movq 48 (%rsi), %r11
28 movq 56 (%rsi), %r12
29
30 prefetcht0 5*64(%rsi)
31
32 movq %rax, (%rdi)
33 movq %rbx, 8 (%rdi)
34 movq %rdx, 16 (%rdi)
35 movq %r8, 24 (%rdi)
36 movq %r9, 32 (%rdi)
37 movq %r10, 40 (%rdi)
38 movq %r11, 48 (%rdi)
39 movq %r12, 56 (%rdi)
40
41 leaq 64 (%rsi), %rsi
42 leaq 64 (%rdi), %rdi
43
44 jnz .Loop64
45
46 movl $5,%ecx
47 .p2align 4
48.Loop2:
49 decl %ecx
50
51 movq (%rsi), %rax
52 movq 8 (%rsi), %rbx
53 movq 16 (%rsi), %rdx
54 movq 24 (%rsi), %r8
55 movq 32 (%rsi), %r9
56 movq 40 (%rsi), %r10
57 movq 48 (%rsi), %r11
58 movq 56 (%rsi), %r12
59
60 movq %rax, (%rdi)
61 movq %rbx, 8 (%rdi)
62 movq %rdx, 16 (%rdi)
63 movq %r8, 24 (%rdi)
64 movq %r9, 32 (%rdi)
65 movq %r10, 40 (%rdi)
66 movq %r11, 48 (%rdi)
67 movq %r12, 56 (%rdi)
68
69 leaq 64(%rdi),%rdi
70 leaq 64(%rsi),%rsi
71
72 jnz .Loop2
73
74 movq (%rsp),%rbx
75 movq 1*8(%rsp),%r12
76 movq 2*8(%rsp),%r13
77 addq $3*8,%rsp
78 ret
79
80 /* C stepping K8 runs faster using the string copy instructions.
81 It is also a lot simpler. Use this when possible */
82
83#include <asm/cpufeature.h>
84
85 .section .altinstructions,"a"
86 .align 8
87 .quad copy_page
88 .quad copy_page_c
89 .byte X86_FEATURE_K8_C
90 .byte copy_page_c_end-copy_page_c
91 .byte copy_page_c_end-copy_page_c
92 .previous
93
94 .section .altinstr_replacement,"ax"
95copy_page_c:
96 movl $4096/8,%ecx
97 rep
98 movsq
99 ret
100copy_page_c_end:
101 .previous
diff --git a/arch/x86_64/lib/copy_user.S b/arch/x86_64/lib/copy_user.S
new file mode 100644
index 000000000000..bd556c804248
--- /dev/null
+++ b/arch/x86_64/lib/copy_user.S
@@ -0,0 +1,294 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v2.
3 *
4 * Functions to copy from and to user space.
5 */
6
7#define FIX_ALIGNMENT 1
8
9 #include <asm/current.h>
10 #include <asm/offset.h>
11 #include <asm/thread_info.h>
12 #include <asm/cpufeature.h>
13
14/* Standard copy_to_user with segment limit checking */
15 .globl copy_to_user
16 .p2align 4
17copy_to_user:
18 GET_THREAD_INFO(%rax)
19 movq %rdi,%rcx
20 addq %rdx,%rcx
21 jc bad_to_user
22 cmpq threadinfo_addr_limit(%rax),%rcx
23 jae bad_to_user
242:
25 .byte 0xe9 /* 32bit jump */
26 .long .Lcug-1f
271:
28
29 .section .altinstr_replacement,"ax"
303: .byte 0xe9 /* replacement jmp with 32bit immediate */
31 .long copy_user_generic_c-1b /* offset */
32 .previous
33 .section .altinstructions,"a"
34 .align 8
35 .quad 2b
36 .quad 3b
37 .byte X86_FEATURE_K8_C
38 .byte 5
39 .byte 5
40 .previous
41
42/* Standard copy_from_user with segment limit checking */
43 .globl copy_from_user
44 .p2align 4
45copy_from_user:
46 GET_THREAD_INFO(%rax)
47 movq %rsi,%rcx
48 addq %rdx,%rcx
49 jc bad_from_user
50 cmpq threadinfo_addr_limit(%rax),%rcx
51 jae bad_from_user
52 /* FALL THROUGH to copy_user_generic */
53
54 .section .fixup,"ax"
55 /* must zero dest */
56bad_from_user:
57 movl %edx,%ecx
58 xorl %eax,%eax
59 rep
60 stosb
61bad_to_user:
62 movl %edx,%eax
63 ret
64 .previous
65
66
67/*
68 * copy_user_generic - memory copy with exception handling.
69 *
70 * Input:
71 * rdi destination
72 * rsi source
73 * rdx count
74 *
75 * Output:
76 * eax uncopied bytes or 0 if successful.
77 */
78 .globl copy_user_generic
79 .p2align 4
80copy_user_generic:
81 .byte 0x66,0x66,0x90 /* 5 byte nop for replacement jump */
82 .byte 0x66,0x90
831:
84 .section .altinstr_replacement,"ax"
852: .byte 0xe9 /* near jump with 32bit immediate */
86 .long copy_user_generic_c-1b /* offset */
87 .previous
88 .section .altinstructions,"a"
89 .align 8
90 .quad copy_user_generic
91 .quad 2b
92 .byte X86_FEATURE_K8_C
93 .byte 5
94 .byte 5
95 .previous
96.Lcug:
97 pushq %rbx
98 xorl %eax,%eax /* zero for the exception handler */
99
100#ifdef FIX_ALIGNMENT
101 /* check for bad alignment of destination */
102 movl %edi,%ecx
103 andl $7,%ecx
104 jnz .Lbad_alignment
105.Lafter_bad_alignment:
106#endif
107
108 movq %rdx,%rcx
109
110 movl $64,%ebx
111 shrq $6,%rdx
112 decq %rdx
113 js .Lhandle_tail
114
115 .p2align 4
116.Lloop:
117.Ls1: movq (%rsi),%r11
118.Ls2: movq 1*8(%rsi),%r8
119.Ls3: movq 2*8(%rsi),%r9
120.Ls4: movq 3*8(%rsi),%r10
121.Ld1: movq %r11,(%rdi)
122.Ld2: movq %r8,1*8(%rdi)
123.Ld3: movq %r9,2*8(%rdi)
124.Ld4: movq %r10,3*8(%rdi)
125
126.Ls5: movq 4*8(%rsi),%r11
127.Ls6: movq 5*8(%rsi),%r8
128.Ls7: movq 6*8(%rsi),%r9
129.Ls8: movq 7*8(%rsi),%r10
130.Ld5: movq %r11,4*8(%rdi)
131.Ld6: movq %r8,5*8(%rdi)
132.Ld7: movq %r9,6*8(%rdi)
133.Ld8: movq %r10,7*8(%rdi)
134
135 decq %rdx
136
137 leaq 64(%rsi),%rsi
138 leaq 64(%rdi),%rdi
139
140 jns .Lloop
141
142 .p2align 4
143.Lhandle_tail:
144 movl %ecx,%edx
145 andl $63,%ecx
146 shrl $3,%ecx
147 jz .Lhandle_7
148 movl $8,%ebx
149 .p2align 4
150.Lloop_8:
151.Ls9: movq (%rsi),%r8
152.Ld9: movq %r8,(%rdi)
153 decl %ecx
154 leaq 8(%rdi),%rdi
155 leaq 8(%rsi),%rsi
156 jnz .Lloop_8
157
158.Lhandle_7:
159 movl %edx,%ecx
160 andl $7,%ecx
161 jz .Lende
162 .p2align 4
163.Lloop_1:
164.Ls10: movb (%rsi),%bl
165.Ld10: movb %bl,(%rdi)
166 incq %rdi
167 incq %rsi
168 decl %ecx
169 jnz .Lloop_1
170
171.Lende:
172 popq %rbx
173 ret
174
175#ifdef FIX_ALIGNMENT
176 /* align destination */
177 .p2align 4
178.Lbad_alignment:
179 movl $8,%r9d
180 subl %ecx,%r9d
181 movl %r9d,%ecx
182 cmpq %r9,%rdx
183 jz .Lhandle_7
184 js .Lhandle_7
185.Lalign_1:
186.Ls11: movb (%rsi),%bl
187.Ld11: movb %bl,(%rdi)
188 incq %rsi
189 incq %rdi
190 decl %ecx
191 jnz .Lalign_1
192 subq %r9,%rdx
193 jmp .Lafter_bad_alignment
194#endif
195
196 /* table sorted by exception address */
197 .section __ex_table,"a"
198 .align 8
199 .quad .Ls1,.Ls1e
200 .quad .Ls2,.Ls2e
201 .quad .Ls3,.Ls3e
202 .quad .Ls4,.Ls4e
203 .quad .Ld1,.Ls1e
204 .quad .Ld2,.Ls2e
205 .quad .Ld3,.Ls3e
206 .quad .Ld4,.Ls4e
207 .quad .Ls5,.Ls5e
208 .quad .Ls6,.Ls6e
209 .quad .Ls7,.Ls7e
210 .quad .Ls8,.Ls8e
211 .quad .Ld5,.Ls5e
212 .quad .Ld6,.Ls6e
213 .quad .Ld7,.Ls7e
214 .quad .Ld8,.Ls8e
215 .quad .Ls9,.Le_quad
216 .quad .Ld9,.Le_quad
217 .quad .Ls10,.Le_byte
218 .quad .Ld10,.Le_byte
219#ifdef FIX_ALIGNMENT
220 .quad .Ls11,.Lzero_rest
221 .quad .Ld11,.Lzero_rest
222#endif
223 .quad .Le5,.Le_zero
224 .previous
225
226 /* compute 64-offset for main loop. 8 bytes accuracy with error on the
227 pessimistic side. this is gross. it would be better to fix the
228 interface. */
229 /* eax: zero, ebx: 64 */
230.Ls1e: addl $8,%eax
231.Ls2e: addl $8,%eax
232.Ls3e: addl $8,%eax
233.Ls4e: addl $8,%eax
234.Ls5e: addl $8,%eax
235.Ls6e: addl $8,%eax
236.Ls7e: addl $8,%eax
237.Ls8e: addl $8,%eax
238 addq %rbx,%rdi /* +64 */
239 subq %rax,%rdi /* correct destination with computed offset */
240
241 shlq $6,%rdx /* loop counter * 64 (stride length) */
242 addq %rax,%rdx /* add offset to loopcnt */
243 andl $63,%ecx /* remaining bytes */
244 addq %rcx,%rdx /* add them */
245 jmp .Lzero_rest
246
247 /* exception on quad word loop in tail handling */
248 /* ecx: loopcnt/8, %edx: length, rdi: correct */
249.Le_quad:
250 shll $3,%ecx
251 andl $7,%edx
252 addl %ecx,%edx
253 /* edx: bytes to zero, rdi: dest, eax:zero */
254.Lzero_rest:
255 movq %rdx,%rcx
256.Le_byte:
257 xorl %eax,%eax
258.Le5: rep
259 stosb
260 /* when there is another exception while zeroing the rest just return */
261.Le_zero:
262 movq %rdx,%rax
263 jmp .Lende
264
265 /* C stepping K8 runs faster using the string copy instructions.
266 This is also a lot simpler. Use them when possible.
267 Patch in jmps to this code instead of copying it fully
268 to avoid unwanted aliasing in the exception tables. */
269
270 /* rdi destination
271 * rsi source
272 * rdx count
273 *
274 * Output:
275 * eax uncopied bytes or 0 if successful.
276 */
277copy_user_generic_c:
278 movl %edx,%ecx
279 shrl $3,%ecx
280 andl $7,%edx
2811: rep
282 movsq
283 movl %edx,%ecx
2842: rep
285 movsb
2864: movl %ecx,%eax
287 ret
2883: lea (%rdx,%rcx,8),%rax
289 ret
290
291 .section __ex_table,"a"
292 .quad 1b,3b
293 .quad 2b,4b
294 .previous
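copy_to_user(), copy_from_user() and copy_user_generic() all return the number of bytes that could NOT be copied (0 on success), so callers usually translate a non-zero result into -EFAULT. A hedged usage sketch; the function and buffer names are illustrative, not from this patch:

    /* Illustrative caller of copy_to_user(). */
    static ssize_t my_copy_out(char __user *ubuf, const char *kbuf, size_t len)
    {
            unsigned long not_copied = copy_to_user(ubuf, kbuf, len);

            if (not_copied)
                    return -EFAULT; /* part of the user buffer was not writable */
            return len;
    }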
diff --git a/arch/x86_64/lib/csum-copy.S b/arch/x86_64/lib/csum-copy.S
new file mode 100644
index 000000000000..01808ec37836
--- /dev/null
+++ b/arch/x86_64/lib/csum-copy.S
@@ -0,0 +1,233 @@
1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all.
7 */
8 #include <linux/linkage.h>
9 #include <asm/errno.h>
10
11/*
12 * Checksum copy with exception handling.
13 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
14 * destination is zeroed.
15 *
16 * Input
17 * rdi source
18 * rsi destination
19 * edx len (32bit)
20 * ecx sum (32bit)
21 * r8 src_err_ptr (int)
22 * r9 dst_err_ptr (int)
23 *
24 * Output
25 * eax 64bit sum. undefined in case of exception.
26 *
27 * Wrappers need to take care of returning a valid exception sum and of
28 * zeroing the destination. They should also align source or destination to 8 bytes.
29 */
30
31 .macro source
3210:
33 .section __ex_table,"a"
34 .align 8
35 .quad 10b,.Lbad_source
36 .previous
37 .endm
38
39 .macro dest
4020:
41 .section __ex_table,"a"
42 .align 8
43 .quad 20b,.Lbad_dest
44 .previous
45 .endm
46
47 .macro ignore L=.Lignore
4830:
49 .section __ex_table,"a"
50 .align 8
51 .quad 30b,\L
52 .previous
53 .endm
54
55
56 .globl csum_partial_copy_generic
57 .p2align 4
58csum_partial_copy_generic:
59 cmpl $3*64,%edx
60 jle .Lignore
61
62.Lignore:
63 subq $7*8,%rsp
64 movq %rbx,2*8(%rsp)
65 movq %r12,3*8(%rsp)
66 movq %r14,4*8(%rsp)
67 movq %r13,5*8(%rsp)
68 movq %rbp,6*8(%rsp)
69
70 movq %r8,(%rsp)
71 movq %r9,1*8(%rsp)
72
73 movl %ecx,%eax
74 movl %edx,%ecx
75
76 xorl %r9d,%r9d
77 movq %rcx,%r12
78
79 shrq $6,%r12
80 jz .Lhandle_tail /* < 64 */
81
82 clc
83
84 /* main loop: checksum and copy in 64 byte blocks */
85 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
86 /* r11: temp3, rdx: temp4, r12 loopcnt */
87 /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
88 .p2align 4
89.Lloop:
90 source
91 movq (%rdi),%rbx
92 source
93 movq 8(%rdi),%r8
94 source
95 movq 16(%rdi),%r11
96 source
97 movq 24(%rdi),%rdx
98
99 source
100 movq 32(%rdi),%r10
101 source
102 movq 40(%rdi),%rbp
103 source
104 movq 48(%rdi),%r14
105 source
106 movq 56(%rdi),%r13
107
108 ignore 2f
109 prefetcht0 5*64(%rdi)
1102:
111 adcq %rbx,%rax
112 adcq %r8,%rax
113 adcq %r11,%rax
114 adcq %rdx,%rax
115 adcq %r10,%rax
116 adcq %rbp,%rax
117 adcq %r14,%rax
118 adcq %r13,%rax
119
120 decl %r12d
121
122 dest
123 movq %rbx,(%rsi)
124 dest
125 movq %r8,8(%rsi)
126 dest
127 movq %r11,16(%rsi)
128 dest
129 movq %rdx,24(%rsi)
130
131 dest
132 movq %r10,32(%rsi)
133 dest
134 movq %rbp,40(%rsi)
135 dest
136 movq %r14,48(%rsi)
137 dest
138 movq %r13,56(%rsi)
139
1403:
141
142 leaq 64(%rdi),%rdi
143 leaq 64(%rsi),%rsi
144
145 jnz .Lloop
146
147 adcq %r9,%rax
148
149 /* do the last up to 56 bytes */
150.Lhandle_tail:
151 /* ecx: count */
152 movl %ecx,%r10d
153 andl $63,%ecx
154 shrl $3,%ecx
155 jz .Lfold
156 clc
157 .p2align 4
158.Lloop_8:
159 source
160 movq (%rdi),%rbx
161 adcq %rbx,%rax
162 decl %ecx
163 dest
164 movq %rbx,(%rsi)
165 leaq 8(%rsi),%rsi /* preserve carry */
166 leaq 8(%rdi),%rdi
167 jnz .Lloop_8
168 adcq %r9,%rax /* add in carry */
169
170.Lfold:
171 /* reduce checksum to 32bits */
172 movl %eax,%ebx
173 shrq $32,%rax
174 addl %ebx,%eax
175 adcl %r9d,%eax
176
177 /* do the last up to 6 bytes */
178.Lhandle_7:
179 movl %r10d,%ecx
180 andl $7,%ecx
181 shrl $1,%ecx
182 jz .Lhandle_1
183 movl $2,%edx
184 xorl %ebx,%ebx
185 clc
186 .p2align 4
187.Lloop_1:
188 source
189 movw (%rdi),%bx
190 adcl %ebx,%eax
191 dest
192 decl %ecx
193 movw %bx,(%rsi)
194 leaq 2(%rdi),%rdi
195 leaq 2(%rsi),%rsi
196 jnz .Lloop_1
197 adcl %r9d,%eax /* add in carry */
198
199 /* handle last odd byte */
200.Lhandle_1:
201 testl $1,%r10d
202 jz .Lende
203 xorl %ebx,%ebx
204 source
205 movb (%rdi),%bl
206 dest
207 movb %bl,(%rsi)
208 addl %ebx,%eax
209 adcl %r9d,%eax /* carry */
210
211.Lende:
212 movq 2*8(%rsp),%rbx
213 movq 3*8(%rsp),%r12
214 movq 4*8(%rsp),%r14
215 movq 5*8(%rsp),%r13
216 movq 6*8(%rsp),%rbp
217 addq $7*8,%rsp
218 ret
219
220 /* Exception handlers. Very simple, zeroing is done in the wrappers */
221.Lbad_source:
222 movq (%rsp),%rax
223 testq %rax,%rax
224 jz .Lende
225 movl $-EFAULT,(%rax)
226 jmp .Lende
227
228.Lbad_dest:
229 movq 8(%rsp),%rax
230 testq %rax,%rax
231 jz .Lende
232 movl $-EFAULT,(%rax)
233 jmp .Lende
diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c
new file mode 100644
index 000000000000..5384e227cdf6
--- /dev/null
+++ b/arch/x86_64/lib/csum-partial.c
@@ -0,0 +1,150 @@
1/*
2 * arch/x86_64/lib/csum-partial.c
3 *
4 * This file contains network checksum routines that are better done
5 * in an architecture-specific manner due to speed.
6 */
7
8#include <linux/compiler.h>
9#include <linux/module.h>
10#include <asm/checksum.h>
11
12#define __force_inline inline __attribute__((always_inline))
13
14static inline unsigned short from32to16(unsigned a)
15{
16 unsigned short b = a >> 16;
17 asm("addw %w2,%w0\n\t"
18 "adcw $0,%w0\n"
19 : "=r" (b)
20 : "0" (b), "r" (a));
21 return b;
22}
23
24/*
25 * Do a 64-bit checksum on an arbitrary memory area.
26 * Returns a 32bit checksum.
27 *
28 * This isn't as time critical as it used to be because many NICs
29 * do hardware checksumming these days.
30 *
31 * Things tried and found to not make it faster:
32 * Manual Prefetching
33 * Unrolling to a 128 byte inner loop.
34 * Using interleaving with more registers to break the carry chains.
35 */
36static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len)
37{
38 unsigned odd, count;
39 unsigned long result = 0;
40
41 if (unlikely(len == 0))
42 return result;
43 odd = 1 & (unsigned long) buff;
44 if (unlikely(odd)) {
45 result = *buff << 8;
46 len--;
47 buff++;
48 }
49 count = len >> 1; /* nr of 16-bit words.. */
50 if (count) {
51 if (2 & (unsigned long) buff) {
52 result += *(unsigned short *)buff;
53 count--;
54 len -= 2;
55 buff += 2;
56 }
57 count >>= 1; /* nr of 32-bit words.. */
58 if (count) {
59 unsigned long zero;
60 unsigned count64;
61 if (4 & (unsigned long) buff) {
62 result += *(unsigned int *) buff;
63 count--;
64 len -= 4;
65 buff += 4;
66 }
67 count >>= 1; /* nr of 64-bit words.. */
68
69 /* main loop using 64byte blocks */
70 zero = 0;
71 count64 = count >> 3;
72 while (count64) {
73 asm("addq 0*8(%[src]),%[res]\n\t"
74 "adcq 1*8(%[src]),%[res]\n\t"
75 "adcq 2*8(%[src]),%[res]\n\t"
76 "adcq 3*8(%[src]),%[res]\n\t"
77 "adcq 4*8(%[src]),%[res]\n\t"
78 "adcq 5*8(%[src]),%[res]\n\t"
79 "adcq 6*8(%[src]),%[res]\n\t"
80 "adcq 7*8(%[src]),%[res]\n\t"
81 "adcq %[zero],%[res]"
82 : [res] "=r" (result)
83 : [src] "r" (buff), [zero] "r" (zero),
84 "[res]" (result));
85 buff += 64;
86 count64--;
87 }
88
89 /* last up to 7 8-byte blocks */
90 count %= 8;
91 while (count) {
92 asm("addq %1,%0\n\t"
93 "adcq %2,%0\n"
94 : "=r" (result)
95 : "m" (*(unsigned long *)buff),
96 "r" (zero), "0" (result));
97 --count;
98 buff += 8;
99 }
100 result = add32_with_carry(result>>32,
101 result&0xffffffff);
102
103 if (len & 4) {
104 result += *(unsigned int *) buff;
105 buff += 4;
106 }
107 }
108 if (len & 2) {
109 result += *(unsigned short *) buff;
110 buff += 2;
111 }
112 }
113 if (len & 1)
114 result += *buff;
115 result = add32_with_carry(result>>32, result & 0xffffffff);
116 if (unlikely(odd)) {
117 result = from32to16(result);
118 result = ((result >> 8) & 0xff) | ((result & 0xff) << 8);
119 }
120 return result;
121}
122
123/*
124 * computes the checksum of a memory block at buff, length len,
125 * and adds in "sum" (32-bit)
126 *
127 * returns a 32-bit number suitable for feeding into itself
128 * or csum_tcpudp_magic
129 *
130 * this function must be called with even lengths, except
131 * for the last fragment, which may be odd
132 *
133 * it's best to have buff aligned on a 64-bit boundary
134 */
135unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum)
136{
137 return add32_with_carry(do_csum(buff, len), sum);
138}
139
140EXPORT_SYMBOL(csum_partial);
141
142/*
143 * this routine is used for miscellaneous IP-like checksums, mainly
144 * in icmp.c
145 */
146unsigned short ip_compute_csum(unsigned char * buff, int len)
147{
148 return csum_fold(csum_partial(buff,len,0));
149}
150
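do_csum() repeatedly folds its 64-bit accumulator back to 32 bits with add32_with_carry() from the <asm/checksum.h> header included above. The folding is ordinary one's-complement end-around-carry arithmetic; a portable sketch for illustration (the in-kernel helper performs the equivalent operation with an add and an add-with-carry):

    /* Portable illustration of folding a 64-bit sum to 32 bits with carry. */
    #include <stdint.h>

    static uint32_t fold64_sketch(uint64_t sum)
    {
            uint64_t t = (sum >> 32) + (uint32_t)sum;       /* add high and low halves */
            return (uint32_t)t + (uint32_t)(t >> 32);       /* add the carry back in   */
    }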
diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c
new file mode 100644
index 000000000000..94323f20816e
--- /dev/null
+++ b/arch/x86_64/lib/csum-wrappers.c
@@ -0,0 +1,129 @@
1/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
2 * Subject to the GNU Public License v.2
3 *
4 * Wrappers of assembly checksum functions for x86-64.
5 */
6
7#include <asm/checksum.h>
8#include <linux/module.h>
9
10/**
11 * csum_partial_copy_from_user - Copy and checksum from user space.
12 * @src: source address (user space)
13 * @dst: destination address
14 * @len: number of bytes to be copied.
15 * @isum: initial sum that is added into the result (32bit unfolded)
16 * @errp: set to -EFAULT for a bad source address.
17 *
18 * Returns a 32bit unfolded checksum of the buffer.
19 * src and dst are best aligned to 64bits.
20 */
21unsigned int
22csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst,
23 int len, unsigned int isum, int *errp)
24{
25 might_sleep();
26 *errp = 0;
27 if (likely(access_ok(VERIFY_READ,src, len))) {
28 /* Why 6, not 7? To handle odd aligned addresses we
29 would need considerable complications to fix the
30 checksum, which is defined as a 16bit accumulator. The
31 alignment fix code exists mainly for performance
32 compatibility with 32bit, and that handles odd
33 addresses slowly anyway. */
34 if (unlikely((unsigned long)src & 6)) {
35 while (((unsigned long)src & 6) && len >= 2) {
36 __u16 val16;
37 *errp = __get_user(val16, (__u16 __user *)src);
38 if (*errp)
39 return isum;
40 *(__u16 *)dst = val16;
41 isum = add32_with_carry(isum, val16);
42 src += 2;
43 dst += 2;
44 len -= 2;
45 }
46 }
47 isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL);
48 if (likely(*errp == 0))
49 return isum;
50 }
51 *errp = -EFAULT;
52 memset(dst,0,len);
53 return isum;
54}
55
56EXPORT_SYMBOL(csum_partial_copy_from_user);
57
58/**
59 * csum_partial_copy_to_user - Copy and checksum to user space.
60 * @src: source address
61 * @dst: destination address (user space)
62 * @len: number of bytes to be copied.
63 * @isum: initial sum that is added into the result (32bit unfolded)
65 * @errp: set to -EFAULT for a bad destination address.
66 *
67 * Returns a 32bit unfolded checksum of the buffer.
67 * src and dst are best aligned to 64bits.
68 */
69unsigned int
70csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst,
71 int len, unsigned int isum, int *errp)
72{
73 might_sleep();
74 if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
75 *errp = -EFAULT;
76 return 0;
77 }
78
79 if (unlikely((unsigned long)dst & 6)) {
80 while (((unsigned long)dst & 6) && len >= 2) {
81 __u16 val16 = *(__u16 *)src;
82 isum = add32_with_carry(isum, val16);
83 *errp = __put_user(val16, (__u16 __user *)dst);
84 if (*errp)
85 return isum;
86 src += 2;
87 dst += 2;
88 len -= 2;
89 }
90 }
91
92 *errp = 0;
93 return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp);
94}
95
96EXPORT_SYMBOL(csum_partial_copy_to_user);
97
98/**
99 * csum_partial_copy_nocheck - Copy and checksum.
100 * @src: source address
101 * @dst: destination address
102 * @len: number of bytes to be copied.
103 * @isum: initial sum that is added into the result (32bit unfolded)
104 *
105 * Returns a 32bit unfolded checksum of the buffer.
106 */
107unsigned int
108csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum)
109{
110 return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL);
111}
112
113unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr,
114 __u32 len, unsigned short proto, unsigned int sum)
115{
116 __u64 rest, sum64;
117
118 rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum;
119 asm(" addq (%[saddr]),%[sum]\n"
120 " adcq 8(%[saddr]),%[sum]\n"
121 " adcq (%[daddr]),%[sum]\n"
122 " adcq 8(%[daddr]),%[sum]\n"
123 " adcq $0,%[sum]\n"
124 : [sum] "=r" (sum64)
125 : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
126 return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32));
127}
128
129EXPORT_SYMBOL(csum_ipv6_magic);
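The csum_ipv6_magic() assembly adds the two 16-byte addresses and the length/protocol/initial-sum word with a 64-bit add-with-carry chain, and the C code then folds and csum_fold()s the result. A portable sketch of equivalent arithmetic, for illustration only (uses GCC's unsigned __int128 to collect the carries):

    /* Illustrative: accumulate saddr, daddr and `rest` as the asm above does. */
    #include <stdint.h>

    static uint32_t csum_ipv6_sketch(const uint64_t saddr[2],
                                     const uint64_t daddr[2], uint64_t rest)
    {
            unsigned __int128 acc = rest;
            acc += saddr[0]; acc += saddr[1];
            acc += daddr[0]; acc += daddr[1];

            /* fold 128 -> 64 -> 32 bits, wrapping carries back in */
            uint64_t s64 = (uint64_t)acc + (uint64_t)(acc >> 64);
            if (s64 < (uint64_t)acc)
                    s64++;
            uint32_t s32 = (uint32_t)s64 + (uint32_t)(s64 >> 32);
            if (s32 < (uint32_t)s64)
                    s32++;
            return s32;     /* caller still csum_fold()s this down to 16 bits */
    }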
diff --git a/arch/x86_64/lib/dec_and_lock.c b/arch/x86_64/lib/dec_and_lock.c
new file mode 100644
index 000000000000..ab43394dc775
--- /dev/null
+++ b/arch/x86_64/lib/dec_and_lock.c
@@ -0,0 +1,40 @@
1/*
2 * x86 version of "atomic_dec_and_lock()" using
3 * the atomic "cmpxchg" instruction.
4 *
5 * (For CPUs lacking cmpxchg, we use the slow
6 * generic version, and this one never even gets
7 * compiled).
8 */
9
10#include <linux/spinlock.h>
11#include <asm/atomic.h>
12
13int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
14{
15 int counter;
16 int newcount;
17
18repeat:
19 counter = atomic_read(atomic);
20 newcount = counter-1;
21
22 if (!newcount)
23 goto slow_path;
24
25 asm volatile("lock; cmpxchgl %1,%2"
26 :"=a" (newcount)
27 :"r" (newcount), "m" (atomic->counter), "0" (counter));
28
29 /* If the above failed, "eax" will have changed */
30 if (newcount != counter)
31 goto repeat;
32 return 0;
33
34slow_path:
35 spin_lock(lock);
36 if (atomic_dec_and_test(atomic))
37 return 1;
38 spin_unlock(lock);
39 return 0;
40}
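The generic atomic_dec_and_lock() helper, which this _atomic_dec_and_lock() backs when CONFIG_HAVE_DEC_LOCK is set (see the Makefile above), returns 1 with the spinlock held only when the counter actually reached zero; that is the classic pattern for releasing a refcounted object that sits on a locked list. A usage sketch with illustrative names (my_obj and obj_lock are not part of this patch):

    #include <linux/list.h>
    #include <linux/slab.h>
    #include <linux/spinlock.h>
    #include <asm/atomic.h>

    struct my_obj {
            atomic_t                refcount;
            struct list_head        node;
    };

    static spinlock_t obj_lock = SPIN_LOCK_UNLOCKED;

    static void put_object(struct my_obj *obj)
    {
            if (atomic_dec_and_lock(&obj->refcount, &obj_lock)) {
                    list_del(&obj->node);   /* last reference: unlink under the lock */
                    spin_unlock(&obj_lock);
                    kfree(obj);
            }
            /* otherwise only the count was dropped; the lock was never taken */
    }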
diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c
new file mode 100644
index 000000000000..6e2d66472eb1
--- /dev/null
+++ b/arch/x86_64/lib/delay.c
@@ -0,0 +1,48 @@
1/*
2 * Precise Delay Loops for x86-64
3 *
4 * Copyright (C) 1993 Linus Torvalds
5 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
6 *
7 * The __delay function must _NOT_ be inlined as its execution time
8 * depends wildly on alignment on many x86 processors.
9 */
10
11#include <linux/config.h>
12#include <linux/sched.h>
13#include <linux/delay.h>
14#include <asm/delay.h>
15
16#ifdef CONFIG_SMP
17#include <asm/smp.h>
18#endif
19
20int x86_udelay_tsc = 0; /* Delay via TSC */
21
22void __delay(unsigned long loops)
23{
24 unsigned bclock, now;
25
26 rdtscl(bclock);
27 do
28 {
29 rep_nop();
30 rdtscl(now);
31 }
32 while((now-bclock) < loops);
33}
34
35inline void __const_udelay(unsigned long xloops)
36{
37 __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ);
38}
39
40void __udelay(unsigned long usecs)
41{
42 __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */
43}
44
45void __ndelay(unsigned long nsecs)
46{
47 __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
48}
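The multipliers in __udelay() and __ndelay() are 32.32 fixed-point fractions: 0x000010c6 is 2^32/10^6 (4294, rounded down) and 0x00005 is 2^32/10^9 (about 4.29, rounded up to 5). The product xloops therefore equals the requested delay in seconds times 2^32, so ((xloops * loops_per_jiffy) >> 32) * HZ in __const_udelay() is that delay expressed in delay-loop iterations. A small userspace check of the constants, for illustration:

    /* Verify the fixed-point constants used above. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t two32 = 1ULL << 32;

            printf("2^32/1e6 = 0x%llx\n",                   /* 0x10c6 */
                   (unsigned long long)(two32 / 1000000));
            printf("2^32/1e9, rounded up = %llu\n",         /* 5 */
                   (unsigned long long)((two32 + 999999999) / 1000000000));
            return 0;
    }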
diff --git a/arch/x86_64/lib/getuser.S b/arch/x86_64/lib/getuser.S
new file mode 100644
index 000000000000..f94ea8a44051
--- /dev/null
+++ b/arch/x86_64/lib/getuser.S
@@ -0,0 +1,101 @@
1/*
2 * __get_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __get_user_X
15 *
16 * Inputs: %rcx contains the address.
17 * The register is modified, but all changes are undone
18 * before returning because the C code doesn't know about it.
19 *
20 * Outputs: %rax is error code (0 or -EFAULT)
21 * %rdx contains zero-extended value
22 *
23 * %r8 is destroyed.
24 *
25 * These functions should not modify any other registers,
26 * as they get called from within inline assembly.
27 */
28
29#include <linux/linkage.h>
30#include <asm/page.h>
31#include <asm/errno.h>
32#include <asm/offset.h>
33#include <asm/thread_info.h>
34
35 .text
36 .p2align 4
37.globl __get_user_1
38__get_user_1:
39 GET_THREAD_INFO(%r8)
40 cmpq threadinfo_addr_limit(%r8),%rcx
41 jae bad_get_user
421: movzb (%rcx),%edx
43 xorl %eax,%eax
44 ret
45
46 .p2align 4
47.globl __get_user_2
48__get_user_2:
49 GET_THREAD_INFO(%r8)
50 addq $1,%rcx
51 jc 20f
52 cmpq threadinfo_addr_limit(%r8),%rcx
53 jae 20f
54 decq %rcx
552: movzwl (%rcx),%edx
56 xorl %eax,%eax
57 ret
5820: decq %rcx
59 jmp bad_get_user
60
61 .p2align 4
62.globl __get_user_4
63__get_user_4:
64 GET_THREAD_INFO(%r8)
65 addq $3,%rcx
66 jc 30f
67 cmpq threadinfo_addr_limit(%r8),%rcx
68 jae 30f
69 subq $3,%rcx
703: movl (%rcx),%edx
71 xorl %eax,%eax
72 ret
7330: subq $3,%rcx
74 jmp bad_get_user
75
76 .p2align 4
77.globl __get_user_8
78__get_user_8:
79 GET_THREAD_INFO(%r8)
80 addq $7,%rcx
81 jc bad_get_user
82 cmpq threadinfo_addr_limit(%r8),%rcx
83 jae bad_get_user
84 subq $7,%rcx
854: movq (%rcx),%rdx
86 xorl %eax,%eax
87 ret
8840: subq $7,%rcx
89 jmp bad_get_user
90
91bad_get_user:
92 xorl %edx,%edx
93 movq $(-EFAULT),%rax
94 ret
95
96.section __ex_table,"a"
97 .quad 1b,bad_get_user
98 .quad 2b,bad_get_user
99 .quad 3b,bad_get_user
100 .quad 4b,bad_get_user
101.previous
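The stubs above follow the convention documented in the header: address in %rcx, zero-extended value returned in %rdx, error code (0 or -EFAULT) in %rax, and only %r8 clobbered beyond that. A hedged sketch of a call site written against exactly that convention (the kernel's real get_user() macro in <asm/uaccess.h> is the authoritative user; this is illustration only):

    /* Illustrative inline-asm call of __get_user_4, per the convention above. */
    static inline int get_user_u32_sketch(unsigned int *val, const void *uptr)
    {
            long err;
            unsigned long tmp;

            asm volatile("call __get_user_4"
                         : "=a" (err), "=d" (tmp)
                         : "c" (uptr)           /* user address in %rcx */
                         : "r8", "cc", "memory");
            *val = (unsigned int)tmp;
            return (int)err;        /* 0 on success, -EFAULT on fault */
    }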
diff --git a/arch/x86_64/lib/io.c b/arch/x86_64/lib/io.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86_64/lib/io.c
@@ -0,0 +1,23 @@
1#include <linux/string.h>
2#include <asm/io.h>
3#include <linux/module.h>
4
5void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
6{
7 __inline_memcpy((void *) dst,src,len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
12{
13 __inline_memcpy(dst,(const void *) src,len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /* XXX: memset can mangle the IO patterns quite a bit.
20 perhaps it would be better to use a dumb one */
21 memset((void *)a,b,c);
22}
23EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86_64/lib/memcpy.S b/arch/x86_64/lib/memcpy.S
new file mode 100644
index 000000000000..c6c46494fef5
--- /dev/null
+++ b/arch/x86_64/lib/memcpy.S
@@ -0,0 +1,121 @@
1/* Copyright 2002 Andi Kleen */
2
3 #include <asm/cpufeature.h>
4/*
5 * memcpy - Copy a memory block.
6 *
7 * Input:
8 * rdi destination
9 * rsi source
10 * rdx count
11 *
12 * Output:
13 * rax original destination
14 */
15
16 .globl __memcpy
17 .globl memcpy
18 .p2align 4
19__memcpy:
20memcpy:
21 pushq %rbx
22 movq %rdi,%rax
23
24 movl %edx,%ecx
25 shrl $6,%ecx
26 jz .Lhandle_tail
27
28 .p2align 4
29.Lloop_64:
30 decl %ecx
31
32 movq (%rsi),%r11
33 movq 8(%rsi),%r8
34
35 movq %r11,(%rdi)
36 movq %r8,1*8(%rdi)
37
38 movq 2*8(%rsi),%r9
39 movq 3*8(%rsi),%r10
40
41 movq %r9,2*8(%rdi)
42 movq %r10,3*8(%rdi)
43
44 movq 4*8(%rsi),%r11
45 movq 5*8(%rsi),%r8
46
47 movq %r11,4*8(%rdi)
48 movq %r8,5*8(%rdi)
49
50 movq 6*8(%rsi),%r9
51 movq 7*8(%rsi),%r10
52
53 movq %r9,6*8(%rdi)
54 movq %r10,7*8(%rdi)
55
56 leaq 64(%rsi),%rsi
57 leaq 64(%rdi),%rdi
58 jnz .Lloop_64
59
60.Lhandle_tail:
61 movl %edx,%ecx
62 andl $63,%ecx
63 shrl $3,%ecx
64 jz .Lhandle_7
65 .p2align 4
66.Lloop_8:
67 decl %ecx
68 movq (%rsi),%r8
69 movq %r8,(%rdi)
70 leaq 8(%rdi),%rdi
71 leaq 8(%rsi),%rsi
72 jnz .Lloop_8
73
74.Lhandle_7:
75 movl %edx,%ecx
76 andl $7,%ecx
77 jz .Lende
78 .p2align 4
79.Lloop_1:
80 movb (%rsi),%r8b
81 movb %r8b,(%rdi)
82 incq %rdi
83 incq %rsi
84 decl %ecx
85 jnz .Lloop_1
86
87.Lende:
88 popq %rbx
89 ret
90.Lfinal:
91
92 /* C stepping K8 runs faster using the string copy instructions.
93 It is also a lot simpler. Use this when possible */
94
95 .section .altinstructions,"a"
96 .align 8
97 .quad memcpy
98 .quad memcpy_c
99 .byte X86_FEATURE_K8_C
100 .byte .Lfinal-memcpy
101 .byte memcpy_c_end-memcpy_c
102 .previous
103
104 .section .altinstr_replacement,"ax"
105 /* rdi destination
106 * rsi source
107 * rdx count
108 */
109memcpy_c:
110 movq %rdi,%rax
111 movl %edx,%ecx
112 shrl $3,%ecx
113 andl $7,%edx
114 rep
115 movsq
116 movl %edx,%ecx
117 rep
118 movsb
119 ret
120memcpy_c_end:
121 .previous
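The copy is staged the same way as copy_user_generic above: unrolled 64-byte blocks, then 8-byte words, then single bytes. A plain C rendering of that staging, for reference only (not the optimized path; uses __builtin_memcpy for the possibly unaligned word accesses):

    /* Reference rendering of the 64/8/1-byte staging used by memcpy above. */
    #include <stddef.h>

    static void *memcpy_ref(void *dst, const void *src, size_t n)
    {
            unsigned char *d = dst;
            const unsigned char *s = src;

            while (n >= 64) {               /* .Lloop_64 */
                    __builtin_memcpy(d, s, 64);
                    d += 64; s += 64; n -= 64;
            }
            while (n >= 8) {                /* .Lloop_8 */
                    __builtin_memcpy(d, s, 8);
                    d += 8; s += 8; n -= 8;
            }
            while (n--)                     /* .Lloop_1 */
                    *d++ = *s++;
            return dst;
    }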
diff --git a/arch/x86_64/lib/memmove.c b/arch/x86_64/lib/memmove.c
new file mode 100644
index 000000000000..e93d5255fdc9
--- /dev/null
+++ b/arch/x86_64/lib/memmove.c
@@ -0,0 +1,19 @@
1/* Normally the compiler builtins are used, but sometimes the compiler calls
2   out-of-line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6
7#undef memmove
8void *memmove(void * dest,const void *src,size_t count)
9{
10 if (dest < src) {
11 __inline_memcpy(dest,src,count);
12 } else {
13 char *p = (char *) dest + count;
14 char *s = (char *) src + count;
15 while (count--)
16 *--p = *--s;
17 }
18 return dest;
19}
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
new file mode 100644
index 000000000000..4b4c40638640
--- /dev/null
+++ b/arch/x86_64/lib/memset.S
@@ -0,0 +1,125 @@
1/* Copyright 2002 Andi Kleen, SuSE Labs */
2/*
3 * ISO C memset - set a memory block to a byte value.
4 *
5 * rdi destination
6 * rsi value (char)
7 * rdx count (bytes)
8 *
9 * rax original destination
10 */
11 .globl __memset
12 .globl memset
13 .p2align 4
14memset:
15__memset:
16 movq %rdi,%r10
17 movq %rdx,%r11
18
19 /* expand byte value */
20 movzbl %sil,%ecx
21 movabs $0x0101010101010101,%rax
22 mul %rcx /* with rax, clobbers rdx */
23
24 /* align dst */
25 movl %edi,%r9d
26 andl $7,%r9d
27 jnz .Lbad_alignment
28.Lafter_bad_alignment:
29
30 movl %r11d,%ecx
31 shrl $6,%ecx
32 jz .Lhandle_tail
33
34 .p2align 4
35.Lloop_64:
36 decl %ecx
37 movq %rax,(%rdi)
38 movq %rax,8(%rdi)
39 movq %rax,16(%rdi)
40 movq %rax,24(%rdi)
41 movq %rax,32(%rdi)
42 movq %rax,40(%rdi)
43 movq %rax,48(%rdi)
44 movq %rax,56(%rdi)
45 leaq 64(%rdi),%rdi
46 jnz .Lloop_64
47
48 /* Handle the tail in loops. The loops should be faster than
49 hard-to-predict jump tables. */
50 .p2align 4
51.Lhandle_tail:
52 movl %r11d,%ecx
53 andl $63&(~7),%ecx
54 jz .Lhandle_7
55 shrl $3,%ecx
56 .p2align 4
57.Lloop_8:
58 decl %ecx
59 movq %rax,(%rdi)
60 leaq 8(%rdi),%rdi
61 jnz .Lloop_8
62
63.Lhandle_7:
64 movl %r11d,%ecx
65 andl $7,%ecx
66 jz .Lende
67 .p2align 4
68.Lloop_1:
69 decl %ecx
70 movb %al,(%rdi)
71 leaq 1(%rdi),%rdi
72 jnz .Lloop_1
73
74.Lende:
75 movq %r10,%rax
76 ret
77
78.Lbad_alignment:
79 cmpq $7,%r11
80 jbe .Lhandle_7
81 movq %rax,(%rdi) /* unaligned store */
82 movq $8,%r8
83 subq %r9,%r8
84 addq %r8,%rdi
85 subq %r8,%r11
86 jmp .Lafter_bad_alignment
87
88 /* C stepping K8 runs faster using the string instructions.
89 It is also a lot simpler. Use this when possible */
90
91#include <asm/cpufeature.h>
92
93 .section .altinstructions,"a"
94 .align 8
95 .quad memset
96 .quad memset_c
97 .byte X86_FEATURE_K8_C
98 .byte memset_c_end-memset_c
99 .byte memset_c_end-memset_c
100 .previous
101
102 .section .altinstr_replacement,"ax"
103 /* rdi destination
104 * rsi value
105 * rdx count
106 */
107memset_c:
108 movq %rdi,%r9
109 movl %edx,%r8d
110 andl $7,%r8d
111 movl %edx,%ecx
112 shrl $3,%ecx
113 /* expand byte value */
114 movzbl %sil,%esi
115 movabs $0x0101010101010101,%rax
116 mulq %rsi /* with rax, clobbers rdx */
117 rep
118 stosq
119 movl %r8d,%ecx
120 rep
121 stosb
122 movq %r9,%rax
123 ret
124memset_c_end:
125 .previous
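Both variants replicate the fill byte across all eight byte lanes by multiplying with 0x0101010101010101 before storing quadwords. The same trick in plain C, for illustration:

    /* Byte replication as used by memset above: 0xab -> 0xabababababababab. */
    #include <stdint.h>

    static uint64_t spread_byte(unsigned char c)
    {
            return (uint64_t)c * 0x0101010101010101ULL;
    }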
diff --git a/arch/x86_64/lib/putuser.S b/arch/x86_64/lib/putuser.S
new file mode 100644
index 000000000000..0dee1fdcb162
--- /dev/null
+++ b/arch/x86_64/lib/putuser.S
@@ -0,0 +1,89 @@
1/*
2 * __put_user functions.
3 *
4 * (C) Copyright 1998 Linus Torvalds
5 * (C) Copyright 2005 Andi Kleen
6 *
7 * These functions have a non-standard call interface
8 * to make them more efficient, especially as they
9 * return an error value in addition to the "real"
10 * return value.
11 */
12
13/*
14 * __put_user_X
15 *
16 * Inputs: %rcx contains the address
17 * %rdx contains new value
18 *
19 * Outputs: %rax is error code (0 or -EFAULT)
20 *
21 * %r8 is destroyed.
22 *
23 * These functions should not modify any other registers,
24 * as they get called from within inline assembly.
25 */
26
27#include <linux/linkage.h>
28#include <asm/page.h>
29#include <asm/errno.h>
30#include <asm/offset.h>
31#include <asm/thread_info.h>
32
33 .text
34 .p2align 4
35.globl __put_user_1
36__put_user_1:
37 GET_THREAD_INFO(%r8)
38 cmpq threadinfo_addr_limit(%r8),%rcx
39 jae bad_put_user
401: movb %dl,(%rcx)
41 xorl %eax,%eax
42 ret
43
44 .p2align 4
45.globl __put_user_2
46__put_user_2:
47 GET_THREAD_INFO(%r8)
48 addq $1,%rcx
49 jc bad_put_user
50 cmpq threadinfo_addr_limit(%r8),%rcx
51 jae bad_put_user
522: movw %dx,-1(%rcx)
53 xorl %eax,%eax
54 ret
55
56 .p2align 4
57.globl __put_user_4
58__put_user_4:
59 GET_THREAD_INFO(%r8)
60 addq $3,%rcx
61 jc bad_put_user
62 cmpq threadinfo_addr_limit(%r8),%rcx
63 jae bad_put_user
643: movl %edx,-3(%rcx)
65 xorl %eax,%eax
66 ret
67
68 .p2align 4
69.globl __put_user_8
70__put_user_8:
71 GET_THREAD_INFO(%r8)
72 addq $7,%rcx
73 jc bad_put_user
74 cmpq threadinfo_addr_limit(%r8),%rcx
75 jae bad_put_user
764: movq %rdx,-7(%rcx)
77 xorl %eax,%eax
78 ret
79
80bad_put_user:
81 movq $(-EFAULT),%rax
82 ret
83
84.section __ex_table,"a"
85 .quad 1b,bad_put_user
86 .quad 2b,bad_put_user
87 .quad 3b,bad_put_user
88 .quad 4b,bad_put_user
89.previous
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
new file mode 100644
index 000000000000..acc1e2ca7ed7
--- /dev/null
+++ b/arch/x86_64/lib/thunk.S
@@ -0,0 +1,95 @@
1 /*
2 * Save registers before calling assembly functions. This avoids
3 * disturbance of register allocation in some inline assembly constructs.
4 * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
5 * Subject to the GNU public license, v.2. No warranty of any kind.
6 * $Id: thunk.S,v 1.2 2002/03/13 20:06:58 ak Exp $
7 */
8
9 #include <linux/config.h>
10 #include <linux/linkage.h>
11 #include <asm/dwarf2.h>
12 #include <asm/calling.h>
13 #include <asm/rwlock.h>
14
15 /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
16 .macro thunk name,func
17 .globl \name
18\name:
19 CFI_STARTPROC
20 SAVE_ARGS
21 call \func
22 jmp restore
23 CFI_ENDPROC
24 .endm
25
26 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
27 .macro thunk_retrax name,func
28 .globl \name
29\name:
30 CFI_STARTPROC
31 SAVE_ARGS
32 call \func
33 jmp restore_norax
34 CFI_ENDPROC
35 .endm
36
37
38 .section .sched.text
39#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
40 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
41 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
42 thunk rwsem_wake_thunk,rwsem_wake
43 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
44#endif
45 thunk do_softirq_thunk,do_softirq
46
47 thunk __down_failed,__down
48 thunk_retrax __down_failed_interruptible,__down_interruptible
49 thunk_retrax __down_failed_trylock,__down_trylock
50 thunk __up_wakeup,__up
51
52 /* SAVE_ARGS below is used only for the .cfi directives it contains. */
53 CFI_STARTPROC
54 SAVE_ARGS
55restore:
56 RESTORE_ARGS
57 ret
58 CFI_ENDPROC
59
60 CFI_STARTPROC
61 SAVE_ARGS
62restore_norax:
63 RESTORE_ARGS 1
64 ret
65 CFI_ENDPROC
66
67#ifdef CONFIG_SMP
68/* Support for read/write spinlocks. */
69 .text
70/* rax: pointer to rwlock_t */
71ENTRY(__write_lock_failed)
72 lock
73 addl $RW_LOCK_BIAS,(%rax)
741: rep
75 nop
76 cmpl $RW_LOCK_BIAS,(%rax)
77 jne 1b
78 lock
79 subl $RW_LOCK_BIAS,(%rax)
80 jnz __write_lock_failed
81 ret
82
83/* rax: pointer to rwlock_t */
84ENTRY(__read_lock_failed)
85 lock
86 incl (%rax)
871: rep
88 nop
89 cmpl $1,(%rax)
90 js 1b
91 lock
92 decl (%rax)
93 js __read_lock_failed
94 ret
95#endif
diff --git a/arch/x86_64/lib/usercopy.c b/arch/x86_64/lib/usercopy.c
new file mode 100644
index 000000000000..db8abba1ad81
--- /dev/null
+++ b/arch/x86_64/lib/usercopy.c
@@ -0,0 +1,153 @@
1/*
2 * User address space access functions.
3 *
4 * Copyright 1997 Andi Kleen <ak@muc.de>
5 * Copyright 1997 Linus Torvalds
6 * Copyright 2002 Andi Kleen <ak@suse.de>
7 */
8#include <asm/uaccess.h>
9
10/*
11 * Copy a null terminated string from userspace.
12 */
13
14#define __do_strncpy_from_user(dst,src,count,res) \
15do { \
16 long __d0, __d1, __d2; \
17 might_sleep(); \
18 __asm__ __volatile__( \
19 " testq %1,%1\n" \
20 " jz 2f\n" \
21 "0: lodsb\n" \
22 " stosb\n" \
23 " testb %%al,%%al\n" \
24 " jz 1f\n" \
25 " decq %1\n" \
26 " jnz 0b\n" \
27 "1: subq %1,%0\n" \
28 "2:\n" \
29 ".section .fixup,\"ax\"\n" \
30 "3: movq %5,%0\n" \
31 " jmp 2b\n" \
32 ".previous\n" \
33 ".section __ex_table,\"a\"\n" \
34 " .align 8\n" \
35 " .quad 0b,3b\n" \
36 ".previous" \
37 : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
38 "=&D" (__d2) \
39 : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
40 : "memory"); \
41} while (0)
42
43long
44__strncpy_from_user(char *dst, const char __user *src, long count)
45{
46 long res;
47 __do_strncpy_from_user(dst, src, count, res);
48 return res;
49}
50
51long
52strncpy_from_user(char *dst, const char __user *src, long count)
53{
54 long res = -EFAULT;
55 if (access_ok(VERIFY_READ, src, 1))
56 __do_strncpy_from_user(dst, src, count, res);
57 return res;
58}
59
60/*
61 * Zero Userspace
62 */
63
64unsigned long __clear_user(void __user *addr, unsigned long size)
65{
66 long __d0;
67 might_sleep();
68 /* no memory constraint because it doesn't change any memory gcc knows
69 about */
70 asm volatile(
71 " testq %[size8],%[size8]\n"
72 " jz 4f\n"
73 "0: movq %[zero],(%[dst])\n"
74 " addq %[eight],%[dst]\n"
75 " decl %%ecx ; jnz 0b\n"
76 "4: movq %[size1],%%rcx\n"
77 " testl %%ecx,%%ecx\n"
78 " jz 2f\n"
79 "1: movb %b[zero],(%[dst])\n"
80 " incq %[dst]\n"
81 " decl %%ecx ; jnz 1b\n"
82 "2:\n"
83 ".section .fixup,\"ax\"\n"
84 "3: lea 0(%[size1],%[size8],8),%[size8]\n"
85 " jmp 2b\n"
86 ".previous\n"
87 ".section __ex_table,\"a\"\n"
88 " .align 8\n"
89 " .quad 0b,3b\n"
90 " .quad 1b,2b\n"
91 ".previous"
92 : [size8] "=c"(size), [dst] "=&D" (__d0)
93 : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
94 [zero] "r" (0UL), [eight] "r" (8UL));
95 return size;
96}
97
98
99unsigned long clear_user(void __user *to, unsigned long n)
100{
101 if (access_ok(VERIFY_WRITE, to, n))
102 return __clear_user(to, n);
103 return n;
104}
105
106/*
107 * Return the size of a string (including the ending 0)
108 *
109 * Return 0 on exception, a value greater than N if too long
110 */
111
112long strnlen_user(const char __user *s, long n)
113{
114 long res = 0;
115 char c;
116
117 if (!access_ok(VERIFY_READ, s, n))
118 return 0;
119
120 while (1) {
121 if (res>n)
122 return n+1;
123 if (__get_user(c, s))
124 return 0;
125 if (!c)
126 return res+1;
127 res++;
128 s++;
129 }
130}
131
132long strlen_user(const char __user *s)
133{
134 long res = 0;
135 char c;
136
137 for (;;) {
138 if (get_user(c, s))
139 return 0;
140 if (!c)
141 return res+1;
142 res++;
143 s++;
144 }
145}
146
147unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
148{
149 if (access_ok(VERIFY_WRITE, to, len) && access_ok(VERIFY_READ, from, len)) {
150 return copy_user_generic((__force void *)to, (__force void *)from, len);
151 }
152 return len;
153}
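Like the copy routines, __clear_user()/clear_user() and copy_in_user() return the number of bytes that could not be processed, with 0 meaning complete success; the .fixup code above reconstructs that remainder from the quadword and byte counters. A hedged usage sketch (pad_user_tail and its arguments are illustrative only, not part of this patch):

    /* Illustrative caller: zero the unused tail of a user buffer. */
    static int pad_user_tail(char __user *buf, unsigned long used, unsigned long total)
    {
            unsigned long left = clear_user(buf + used, total - used);

            return left ? -EFAULT : 0;      /* clear_user returns bytes NOT cleared */
    }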