about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-21 16:46:28 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2010-10-21 16:46:28 -0400
commit: 8d8d2e9ccd331a1345c88b292ebee9d256fd8749 (patch)
tree: 605e038fa15477453a94ef48cc25a092fe2ac9e8
parent: 2a8b67fb72c4c4bc15fe8095e3ed613789c8b82f (diff)
parent: 3b4b682becdfa9f42321aa024d5cc84f71f06d8c (diff)
Merge branch 'x86-mem-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-mem-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  x86, mem: Optimize memmove for small size and unaligned cases
  x86, mem: Optimize memcpy by avoiding memory false dependece
  x86, mem: Don't implement forward memmove() as memcpy()
-rw-r--r-- arch/x86/lib/memcpy_32.c 199
-rw-r--r-- arch/x86/lib/memcpy_64.S 158
-rw-r--r-- arch/x86/lib/memmove_64.c 189
3 files changed, 465 insertions, 81 deletions
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
22 22
23void *memmove(void *dest, const void *src, size_t n) 23void *memmove(void *dest, const void *src, size_t n)
24{ 24{
25 int d0, d1, d2; 25 int d0,d1,d2,d3,d4,d5;
26 26 char *ret = dest;
27 if (dest < src) { 27
28 memcpy(dest, src, n); 28 __asm__ __volatile__(
29 } else { 29 /* Handle more 16bytes in loop */
30 __asm__ __volatile__( 30 "cmp $0x10, %0\n\t"
31 "std\n\t" 31 "jb 1f\n\t"
32 "rep\n\t" 32
33 "movsb\n\t" 33 /* Decide forward/backward copy mode */
34 "cld" 34 "cmp %2, %1\n\t"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2) 35 "jb 2f\n\t"
36 :"0" (n), 36
37 "1" (n-1+src), 37 /*
38 "2" (n-1+dest) 38 * movs instruction have many startup latency
39 :"memory"); 39 * so we handle small size by general register.
40 } 40 */
41 return dest; 41 "cmp $680, %0\n\t"
42 "jb 3f\n\t"
43 /*
44 * movs instruction is only good for aligned case.
45 */
46 "mov %1, %3\n\t"
47 "xor %2, %3\n\t"
48 "and $0xff, %3\n\t"
49 "jz 4f\n\t"
50 "3:\n\t"
51 "sub $0x10, %0\n\t"
52
53 /*
54 * We gobble 16byts forward in each loop.
55 */
56 "3:\n\t"
57 "sub $0x10, %0\n\t"
58 "mov 0*4(%1), %3\n\t"
59 "mov 1*4(%1), %4\n\t"
60 "mov %3, 0*4(%2)\n\t"
61 "mov %4, 1*4(%2)\n\t"
62 "mov 2*4(%1), %3\n\t"
63 "mov 3*4(%1), %4\n\t"
64 "mov %3, 2*4(%2)\n\t"
65 "mov %4, 3*4(%2)\n\t"
66 "lea 0x10(%1), %1\n\t"
67 "lea 0x10(%2), %2\n\t"
68 "jae 3b\n\t"
69 "add $0x10, %0\n\t"
70 "jmp 1f\n\t"
71
72 /*
73 * Handle data forward by movs.
74 */
75 ".p2align 4\n\t"
76 "4:\n\t"
77 "mov -4(%1, %0), %3\n\t"
78 "lea -4(%2, %0), %4\n\t"
79 "shr $2, %0\n\t"
80 "rep movsl\n\t"
81 "mov %3, (%4)\n\t"
82 "jmp 11f\n\t"
83 /*
84 * Handle data backward by movs.
85 */
86 ".p2align 4\n\t"
87 "6:\n\t"
88 "mov (%1), %3\n\t"
89 "mov %2, %4\n\t"
90 "lea -4(%1, %0), %1\n\t"
91 "lea -4(%2, %0), %2\n\t"
92 "shr $2, %0\n\t"
93 "std\n\t"
94 "rep movsl\n\t"
95 "mov %3,(%4)\n\t"
96 "cld\n\t"
97 "jmp 11f\n\t"
98
99 /*
100 * Start to prepare for backward copy.
101 */
102 ".p2align 4\n\t"
103 "2:\n\t"
104 "cmp $680, %0\n\t"
105 "jb 5f\n\t"
106 "mov %1, %3\n\t"
107 "xor %2, %3\n\t"
108 "and $0xff, %3\n\t"
109 "jz 6b\n\t"
110
111 /*
112 * Calculate copy position to tail.
113 */
114 "5:\n\t"
115 "add %0, %1\n\t"
116 "add %0, %2\n\t"
117 "sub $0x10, %0\n\t"
118
119 /*
120 * We gobble 16byts backward in each loop.
121 */
122 "7:\n\t"
123 "sub $0x10, %0\n\t"
124
125 "mov -1*4(%1), %3\n\t"
126 "mov -2*4(%1), %4\n\t"
127 "mov %3, -1*4(%2)\n\t"
128 "mov %4, -2*4(%2)\n\t"
129 "mov -3*4(%1), %3\n\t"
130 "mov -4*4(%1), %4\n\t"
131 "mov %3, -3*4(%2)\n\t"
132 "mov %4, -4*4(%2)\n\t"
133 "lea -0x10(%1), %1\n\t"
134 "lea -0x10(%2), %2\n\t"
135 "jae 7b\n\t"
136 /*
137 * Calculate copy position to head.
138 */
139 "add $0x10, %0\n\t"
140 "sub %0, %1\n\t"
141 "sub %0, %2\n\t"
142
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 ".p2align 4\n\t"
147 "1:\n\t"
148 "cmp $8, %0\n\t"
149 "jb 8f\n\t"
150 "mov 0*4(%1), %3\n\t"
151 "mov 1*4(%1), %4\n\t"
152 "mov -2*4(%1, %0), %5\n\t"
153 "mov -1*4(%1, %0), %1\n\t"
154
155 "mov %3, 0*4(%2)\n\t"
156 "mov %4, 1*4(%2)\n\t"
157 "mov %5, -2*4(%2, %0)\n\t"
158 "mov %1, -1*4(%2, %0)\n\t"
159 "jmp 11f\n\t"
160
161 /*
162 * Move data from 4 bytes to 7 bytes.
163 */
164 ".p2align 4\n\t"
165 "8:\n\t"
166 "cmp $4, %0\n\t"
167 "jb 9f\n\t"
168 "mov 0*4(%1), %3\n\t"
169 "mov -1*4(%1, %0), %4\n\t"
170 "mov %3, 0*4(%2)\n\t"
171 "mov %4, -1*4(%2, %0)\n\t"
172 "jmp 11f\n\t"
173
174 /*
175 * Move data from 2 bytes to 3 bytes.
176 */
177 ".p2align 4\n\t"
178 "9:\n\t"
179 "cmp $2, %0\n\t"
180 "jb 10f\n\t"
181 "movw 0*2(%1), %%dx\n\t"
182 "movw -1*2(%1, %0), %%bx\n\t"
183 "movw %%dx, 0*2(%2)\n\t"
184 "movw %%bx, -1*2(%2, %0)\n\t"
185 "jmp 11f\n\t"
186
187 /*
188 * Move data for 1 byte.
189 */
190 ".p2align 4\n\t"
191 "10:\n\t"
192 "cmp $1, %0\n\t"
193 "jb 11f\n\t"
194 "movb (%1), %%cl\n\t"
195 "movb %%cl, (%2)\n\t"
196 ".p2align 4\n\t"
197 "11:"
198 : "=&c" (d0), "=&S" (d1), "=&D" (d2),
199 "=r" (d3),"=r" (d4), "=r"(d5)
200 :"0" (n),
201 "1" (src),
202 "2" (dest)
203 :"memory");
204
205 return ret;
206
42} 207}
43EXPORT_SYMBOL(memmove); 208EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
40ENTRY(__memcpy) 40ENTRY(__memcpy)
41ENTRY(memcpy) 41ENTRY(memcpy)
42 CFI_STARTPROC 42 CFI_STARTPROC
43 movq %rdi, %rax
43 44
44 /* 45 /*
45 * Put the number of full 64-byte blocks into %ecx. 46 * Use 32bit CMP here to avoid long NOP padding.
46 * Tail portion is handled at the end:
47 */ 47 */
48 movq %rdi, %rax 48 cmp $0x20, %edx
49 movl %edx, %ecx 49 jb .Lhandle_tail
50 shrl $6, %ecx
51 jz .Lhandle_tail
52 50
53 .p2align 4
54.Lloop_64:
55 /* 51 /*
56 * We decrement the loop index here - and the zero-flag is 52 * We check whether memory false dependece could occur,
57 * checked at the end of the loop (instructions inbetween do 53 * then jump to corresponding copy mode.
58 * not change the zero flag):
59 */ 54 */
60 decl %ecx 55 cmp %dil, %sil
56 jl .Lcopy_backward
57 subl $0x20, %edx
58.Lcopy_forward_loop:
59 subq $0x20, %rdx
61 60
62 /* 61 /*
63 * Move in blocks of 4x16 bytes: 62 * Move in blocks of 4x8 bytes:
64 */ 63 */
65 movq 0*8(%rsi), %r11 64 movq 0*8(%rsi), %r8
66 movq 1*8(%rsi), %r8 65 movq 1*8(%rsi), %r9
67 movq %r11, 0*8(%rdi) 66 movq 2*8(%rsi), %r10
68 movq %r8, 1*8(%rdi) 67 movq 3*8(%rsi), %r11
69 68 leaq 4*8(%rsi), %rsi
70 movq 2*8(%rsi), %r9 69
71 movq 3*8(%rsi), %r10 70 movq %r8, 0*8(%rdi)
72 movq %r9, 2*8(%rdi) 71 movq %r9, 1*8(%rdi)
73 movq %r10, 3*8(%rdi) 72 movq %r10, 2*8(%rdi)
74 73 movq %r11, 3*8(%rdi)
75 movq 4*8(%rsi), %r11 74 leaq 4*8(%rdi), %rdi
76 movq 5*8(%rsi), %r8 75 jae .Lcopy_forward_loop
77 movq %r11, 4*8(%rdi) 76 addq $0x20, %rdx
78 movq %r8, 5*8(%rdi) 77 jmp .Lhandle_tail
79 78
80 movq 6*8(%rsi), %r9 79.Lcopy_backward:
81 movq 7*8(%rsi), %r10 80 /*
82 movq %r9, 6*8(%rdi) 81 * Calculate copy position to tail.
83 movq %r10, 7*8(%rdi) 82 */
84 83 addq %rdx, %rsi
85 leaq 64(%rsi), %rsi 84 addq %rdx, %rdi
86 leaq 64(%rdi), %rdi 85 subq $0x20, %rdx
87 86 /*
88 jnz .Lloop_64 87 * At most 3 ALU operations in one cycle,
88 * so append NOPS in the same 16bytes trunk.
89 */
90 .p2align 4
91.Lcopy_backward_loop:
92 subq $0x20, %rdx
93 movq -1*8(%rsi), %r8
94 movq -2*8(%rsi), %r9
95 movq -3*8(%rsi), %r10
96 movq -4*8(%rsi), %r11
97 leaq -4*8(%rsi), %rsi
98 movq %r8, -1*8(%rdi)
99 movq %r9, -2*8(%rdi)
100 movq %r10, -3*8(%rdi)
101 movq %r11, -4*8(%rdi)
102 leaq -4*8(%rdi), %rdi
103 jae .Lcopy_backward_loop
89 104
105 /*
106 * Calculate copy position to head.
107 */
108 addq $0x20, %rdx
109 subq %rdx, %rsi
110 subq %rdx, %rdi
90.Lhandle_tail: 111.Lhandle_tail:
91 movl %edx, %ecx 112 cmpq $16, %rdx
92 andl $63, %ecx 113 jb .Lless_16bytes
93 shrl $3, %ecx
94 jz .Lhandle_7
95 114
115 /*
116 * Move data from 16 bytes to 31 bytes.
117 */
118 movq 0*8(%rsi), %r8
119 movq 1*8(%rsi), %r9
120 movq -2*8(%rsi, %rdx), %r10
121 movq -1*8(%rsi, %rdx), %r11
122 movq %r8, 0*8(%rdi)
123 movq %r9, 1*8(%rdi)
124 movq %r10, -2*8(%rdi, %rdx)
125 movq %r11, -1*8(%rdi, %rdx)
126 retq
96 .p2align 4 127 .p2align 4
97.Lloop_8: 128.Lless_16bytes:
98 decl %ecx 129 cmpq $8, %rdx
99 movq (%rsi), %r8 130 jb .Lless_8bytes
100 movq %r8, (%rdi) 131 /*
101 leaq 8(%rdi), %rdi 132 * Move data from 8 bytes to 15 bytes.
102 leaq 8(%rsi), %rsi 133 */
103 jnz .Lloop_8 134 movq 0*8(%rsi), %r8
104 135 movq -1*8(%rsi, %rdx), %r9
105.Lhandle_7: 136 movq %r8, 0*8(%rdi)
106 movl %edx, %ecx 137 movq %r9, -1*8(%rdi, %rdx)
107 andl $7, %ecx 138 retq
108 jz .Lend 139 .p2align 4
140.Lless_8bytes:
141 cmpq $4, %rdx
142 jb .Lless_3bytes
109 143
144 /*
145 * Move data from 4 bytes to 7 bytes.
146 */
147 movl (%rsi), %ecx
148 movl -4(%rsi, %rdx), %r8d
149 movl %ecx, (%rdi)
150 movl %r8d, -4(%rdi, %rdx)
151 retq
110 .p2align 4 152 .p2align 4
153.Lless_3bytes:
154 cmpl $0, %edx
155 je .Lend
156 /*
157 * Move data from 1 bytes to 3 bytes.
158 */
111.Lloop_1: 159.Lloop_1:
112 movb (%rsi), %r8b 160 movb (%rsi), %r8b
113 movb %r8b, (%rdi) 161 movb %r8b, (%rdi)
114 incq %rdi 162 incq %rdi
115 incq %rsi 163 incq %rsi
116 decl %ecx 164 decl %edx
117 jnz .Lloop_1 165 jnz .Lloop_1
118 166
119.Lend: 167.Lend:
120 ret 168 retq
121 CFI_ENDPROC 169 CFI_ENDPROC
122ENDPROC(memcpy) 170ENDPROC(memcpy)
123ENDPROC(__memcpy) 171ENDPROC(__memcpy)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909bf122..6d0f0ec41b34 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
8#undef memmove 8#undef memmove
9void *memmove(void *dest, const void *src, size_t count) 9void *memmove(void *dest, const void *src, size_t count)
10{ 10{
11 if (dest < src) { 11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 return memcpy(dest, src, count); 12 char *ret;
13 } else { 13
14 char *p = dest + count; 14 __asm__ __volatile__(
15 const char *s = src + count; 15 /* Handle more 32bytes in loop */
16 while (count--) 16 "mov %2, %3\n\t"
17 *--p = *--s; 17 "cmp $0x20, %0\n\t"
18 } 18 "jb 1f\n\t"
19 return dest; 19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instruction have many startup latency
26 * so we handle small size by general register.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * movsq instruction is only good for aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32byts forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32byts backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
20} 191}
21EXPORT_SYMBOL(memmove); 192EXPORT_SYMBOL(memmove);