aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm/lib/memcpy.S
diff options
context:
space:
mode:
authorNicolas Pitre <nico@cam.org>2005-11-01 14:52:23 -0500
committerRussell King <rmk+kernel@arm.linux.org.uk>2005-11-01 14:52:23 -0500
commit7549423000fc38d39a8b81c601dea0332c113a42 (patch)
tree6b76fe2867b9634a1d1dbaf682c69ccad4e9f71b /arch/arm/lib/memcpy.S
parenta0c6fdb987860e6c7f9b8e57439ca2703f462578 (diff)
[ARM] 2947/1: copy template with new memcpy/memmove
Patch from Nicolas Pitre This patch provides a new implementation for optimized memory copy functions on ARM. It is made of two levels: a template that consists of the core copy code and separate files that define macros to be used with the core code depending on the type of copy needed. This allows for best performances while sharing the same core for implementing memcpy(), copy_from_user() and copy_to_user() for instance. Two reasons for this work: 1) the current copy_to_user/copy_from_user implementation assumes no task switch will ever occur in the middle of each copied page making it completely unsafe with CONFIG_PREEMPT=y. 2) current copy implementations are measurably suboptimal and optimizing different implementations separately is a pain and more opportunities for bugs. The reason for (1) is the fact that copy inside user pages are performed with the ldm instruction which has no mean for testing user protections and could possibly race with process preemption bypassing the COW mechanism for example. This is a longstanding issue that we said ought to be fixed for about two years now. The solution is to substitute those ldm insns with a series of ldrt or strt insns to enforce user memory protection. At least on StrongARM and XScale cores the ldm is not faster than the equivalent ldr/str insns with a warm i-cache so there is no measurable performance degradation with that change. The fact that the copy code is a template makes it pretty easy to reuse the same core code as for memcpy and benefit from the same performance optimizations. Now (2) is best demonstrated with actual throughput measurements. First, here is a summary of memcopy tests performed on a StrongARM core: PTR alignment buffer size kernel version this version ------------------------------------------------------------ aligned 32 59.73 107.43 unaligned 32 61.31 74.72 aligned 100 132.47 136.15 unaligned 100 103.84 123.76 aligned 4096 130.67 130.80 unaligned 4096 130.68 130.64 aligned 1048576 68.03 68.18 unaligned 1048576 68.03 68.18 The buffer size is in bytes and the measured speed in MB/s. The copy was performed repeatedly with given buffer and throughput averaged over 3 seconds. Here we can see that the current kernel version has a higher entry cost that shows up with small buffers. As buffer size grows both implementation converge to the same throughput. Now here's the exact same test performed on an XScale core (PXA255): PTR alignment buffer size kernel version this version ------------------------------------------------------------ aligned 32 46.99 77.58 unaligned 32 53.61 59.59 aligned 100 107.19 136.59 unaligned 100 83.61 97.58 aligned 4096 129.13 129.98 unaligned 4096 128.36 128.53 aligned 1048576 53.76 59.41 unaligned 1048576 33.67 56.96 Again we can see the entry setup cost being higher for the current kernel before getting to the main copy loop. Then throughput results converge as long as the buffer remains in the cache. Then the 1MB case shows more differences probably due to better pld placement and/or less instruction interlocks in this proposed implementation. Disclaimer: The PXA system was running with slower clocks than the StrongARM system so trying to infer any conclusion by comparing those separate sets of results side by side would be completely inappropriate. So... What this patch does is to replace both memcpy and memmove with an implementation based on the provided copy code template. The memmove code is kept separate since it is used only if the memory areas involved do overlap in which case the code is a transposition of the template but with the copy occurring in the opposite direction (trying to fit that mode into the template turned it into a mess not worth it for memmove alone). And obviously both memcpy and memmove were tested with all kinds of pointer alignments and buffer sizes to exercise all code paths for correctness. The next patch will provide the now trivial replacement implementation copy_to_user and copy_from_user. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Diffstat (limited to 'arch/arm/lib/memcpy.S')
-rw-r--r--arch/arm/lib/memcpy.S410
1 files changed, 38 insertions, 372 deletions
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index f5a593ceb8cc..7e71d6708a8d 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -1,393 +1,59 @@
1/* 1/*
2 * linux/arch/arm/lib/memcpy.S 2 * linux/arch/arm/lib/memcpy.S
3 * 3 *
4 * Copyright (C) 1995-1999 Russell King 4 * Author: Nicolas Pitre
5 * Created: Sep 28, 2005
6 * Copyright: MontaVista Software, Inc.
5 * 7 *
6 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
9 *
10 * ASM optimised string functions
11 */ 11 */
12
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <asm/assembler.h> 14#include <asm/assembler.h>
14 15
15 .text 16 .macro ldr1w ptr reg abort
16 17 ldr \reg, [\ptr], #4
17#define ENTER \ 18 .endm
18 mov ip,sp ;\
19 stmfd sp!,{r0,r4-r9,fp,ip,lr,pc} ;\
20 sub fp,ip,#4
21
22#define EXIT \
23 LOADREGS(ea, fp, {r0, r4 - r9, fp, sp, pc})
24
25#define EXITEQ \
26 LOADREGS(eqea, fp, {r0, r4 - r9, fp, sp, pc})
27
28/*
29 * Prototype: void memcpy(void *to,const void *from,unsigned long n);
30 */
31ENTRY(memcpy)
32ENTRY(memmove)
33 ENTER
34 cmp r1, r0
35 bcc 23f
36 subs r2, r2, #4
37 blt 6f
38 PLD( pld [r1, #0] )
39 ands ip, r0, #3
40 bne 7f
41 ands ip, r1, #3
42 bne 8f
43 19
441: subs r2, r2, #8 20 .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
45 blt 5f 21 ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
46 subs r2, r2, #20 22 .endm
47 blt 4f
48 PLD( pld [r1, #28] )
49 PLD( subs r2, r2, #64 )
50 PLD( blt 3f )
512: PLD( pld [r1, #60] )
52 PLD( pld [r1, #92] )
53 ldmia r1!, {r3 - r9, ip}
54 subs r2, r2, #32
55 stmgeia r0!, {r3 - r9, ip}
56 ldmgeia r1!, {r3 - r9, ip}
57 subges r2, r2, #32
58 stmia r0!, {r3 - r9, ip}
59 bge 2b
603: PLD( ldmia r1!, {r3 - r9, ip} )
61 PLD( adds r2, r2, #32 )
62 PLD( stmgeia r0!, {r3 - r9, ip} )
63 PLD( ldmgeia r1!, {r3 - r9, ip} )
64 PLD( subges r2, r2, #32 )
65 PLD( stmia r0!, {r3 - r9, ip} )
664: cmn r2, #16
67 ldmgeia r1!, {r3 - r6}
68 subge r2, r2, #16
69 stmgeia r0!, {r3 - r6}
70 adds r2, r2, #20
71 ldmgeia r1!, {r3 - r5}
72 subge r2, r2, #12
73 stmgeia r0!, {r3 - r5}
745: adds r2, r2, #8
75 blt 6f
76 subs r2, r2, #4
77 ldrlt r3, [r1], #4
78 ldmgeia r1!, {r4, r5}
79 subge r2, r2, #4
80 strlt r3, [r0], #4
81 stmgeia r0!, {r4, r5}
82 23
836: adds r2, r2, #4 24 .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
84 EXITEQ 25 ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
85 cmp r2, #2 26 .endm
86 ldrb r3, [r1], #1
87 ldrgeb r4, [r1], #1
88 ldrgtb r5, [r1], #1
89 strb r3, [r0], #1
90 strgeb r4, [r0], #1
91 strgtb r5, [r0], #1
92 EXIT
93 27
947: rsb ip, ip, #4 28 .macro ldr1b ptr reg cond=al abort
95 cmp ip, #2 29 ldr\cond\()b \reg, [\ptr], #1
96 ldrb r3, [r1], #1 30 .endm
97 ldrgeb r4, [r1], #1
98 ldrgtb r5, [r1], #1
99 strb r3, [r0], #1
100 strgeb r4, [r0], #1
101 strgtb r5, [r0], #1
102 subs r2, r2, ip
103 blt 6b
104 ands ip, r1, #3
105 beq 1b
106 31
1078: bic r1, r1, #3 32 .macro str1w ptr reg abort
108 ldr r7, [r1], #4 33 str \reg, [\ptr], #4
109 cmp ip, #2 34 .endm
110 bgt 18f
111 beq 13f
112 cmp r2, #12
113 blt 11f
114 PLD( pld [r1, #12] )
115 sub r2, r2, #12
116 PLD( subs r2, r2, #32 )
117 PLD( blt 10f )
118 PLD( pld [r1, #28] )
1199: PLD( pld [r1, #44] )
12010: mov r3, r7, pull #8
121 ldmia r1!, {r4 - r7}
122 subs r2, r2, #16
123 orr r3, r3, r4, push #24
124 mov r4, r4, pull #8
125 orr r4, r4, r5, push #24
126 mov r5, r5, pull #8
127 orr r5, r5, r6, push #24
128 mov r6, r6, pull #8
129 orr r6, r6, r7, push #24
130 stmia r0!, {r3 - r6}
131 bge 9b
132 PLD( cmn r2, #32 )
133 PLD( bge 10b )
134 PLD( add r2, r2, #32 )
135 adds r2, r2, #12
136 blt 12f
13711: mov r3, r7, pull #8
138 ldr r7, [r1], #4
139 subs r2, r2, #4
140 orr r3, r3, r7, push #24
141 str r3, [r0], #4
142 bge 11b
14312: sub r1, r1, #3
144 b 6b
145 35
14613: cmp r2, #12 36 .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
147 blt 16f 37 stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
148 PLD( pld [r1, #12] ) 38 .endm
149 sub r2, r2, #12
150 PLD( subs r2, r2, #32 )
151 PLD( blt 15f )
152 PLD( pld [r1, #28] )
15314: PLD( pld [r1, #44] )
15415: mov r3, r7, pull #16
155 ldmia r1!, {r4 - r7}
156 subs r2, r2, #16
157 orr r3, r3, r4, push #16
158 mov r4, r4, pull #16
159 orr r4, r4, r5, push #16
160 mov r5, r5, pull #16
161 orr r5, r5, r6, push #16
162 mov r6, r6, pull #16
163 orr r6, r6, r7, push #16
164 stmia r0!, {r3 - r6}
165 bge 14b
166 PLD( cmn r2, #32 )
167 PLD( bge 15b )
168 PLD( add r2, r2, #32 )
169 adds r2, r2, #12
170 blt 17f
17116: mov r3, r7, pull #16
172 ldr r7, [r1], #4
173 subs r2, r2, #4
174 orr r3, r3, r7, push #16
175 str r3, [r0], #4
176 bge 16b
17717: sub r1, r1, #2
178 b 6b
179 39
18018: cmp r2, #12 40 .macro str1b ptr reg cond=al abort
181 blt 21f 41 str\cond\()b \reg, [\ptr], #1
182 PLD( pld [r1, #12] ) 42 .endm
183 sub r2, r2, #12
184 PLD( subs r2, r2, #32 )
185 PLD( blt 20f )
186 PLD( pld [r1, #28] )
18719: PLD( pld [r1, #44] )
18820: mov r3, r7, pull #24
189 ldmia r1!, {r4 - r7}
190 subs r2, r2, #16
191 orr r3, r3, r4, push #8
192 mov r4, r4, pull #24
193 orr r4, r4, r5, push #8
194 mov r5, r5, pull #24
195 orr r5, r5, r6, push #8
196 mov r6, r6, pull #24
197 orr r6, r6, r7, push #8
198 stmia r0!, {r3 - r6}
199 bge 19b
200 PLD( cmn r2, #32 )
201 PLD( bge 20b )
202 PLD( add r2, r2, #32 )
203 adds r2, r2, #12
204 blt 22f
20521: mov r3, r7, pull #24
206 ldr r7, [r1], #4
207 subs r2, r2, #4
208 orr r3, r3, r7, push #8
209 str r3, [r0], #4
210 bge 21b
21122: sub r1, r1, #1
212 b 6b
213 43
44 .macro enter reg1 reg2
45 stmdb sp!, {r0, \reg1, \reg2}
46 .endm
214 47
21523: add r1, r1, r2 48 .macro exit reg1 reg2
216 add r0, r0, r2 49 ldmfd sp!, {r0, \reg1, \reg2}
217 subs r2, r2, #4 50 .endm
218 blt 29f
219 PLD( pld [r1, #-4] )
220 ands ip, r0, #3
221 bne 30f
222 ands ip, r1, #3
223 bne 31f
224 51
22524: subs r2, r2, #8 52 .text
226 blt 28f
227 subs r2, r2, #20
228 blt 27f
229 PLD( pld [r1, #-32] )
230 PLD( subs r2, r2, #64 )
231 PLD( blt 26f )
23225: PLD( pld [r1, #-64] )
233 PLD( pld [r1, #-96] )
234 ldmdb r1!, {r3 - r9, ip}
235 subs r2, r2, #32
236 stmgedb r0!, {r3 - r9, ip}
237 ldmgedb r1!, {r3 - r9, ip}
238 subges r2, r2, #32
239 stmdb r0!, {r3 - r9, ip}
240 bge 25b
24126: PLD( ldmdb r1!, {r3 - r9, ip} )
242 PLD( adds r2, r2, #32 )
243 PLD( stmgedb r0!, {r3 - r9, ip} )
244 PLD( ldmgedb r1!, {r3 - r9, ip} )
245 PLD( subges r2, r2, #32 )
246 PLD( stmdb r0!, {r3 - r9, ip} )
24727: cmn r2, #16
248 ldmgedb r1!, {r3 - r6}
249 subge r2, r2, #16
250 stmgedb r0!, {r3 - r6}
251 adds r2, r2, #20
252 ldmgedb r1!, {r3 - r5}
253 subge r2, r2, #12
254 stmgedb r0!, {r3 - r5}
25528: adds r2, r2, #8
256 blt 29f
257 subs r2, r2, #4
258 ldrlt r3, [r1, #-4]!
259 ldmgedb r1!, {r4, r5}
260 subge r2, r2, #4
261 strlt r3, [r0, #-4]!
262 stmgedb r0!, {r4, r5}
263 53
26429: adds r2, r2, #4 54/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
265 EXITEQ
266 cmp r2, #2
267 ldrb r3, [r1, #-1]!
268 ldrgeb r4, [r1, #-1]!
269 ldrgtb r5, [r1, #-1]!
270 strb r3, [r0, #-1]!
271 strgeb r4, [r0, #-1]!
272 strgtb r5, [r0, #-1]!
273 EXIT
274 55
27530: cmp ip, #2 56ENTRY(memcpy)
276 ldrb r3, [r1, #-1]!
277 ldrgeb r4, [r1, #-1]!
278 ldrgtb r5, [r1, #-1]!
279 strb r3, [r0, #-1]!
280 strgeb r4, [r0, #-1]!
281 strgtb r5, [r0, #-1]!
282 subs r2, r2, ip
283 blt 29b
284 ands ip, r1, #3
285 beq 24b
286
28731: bic r1, r1, #3
288 ldr r3, [r1], #0
289 cmp ip, #2
290 blt 41f
291 beq 36f
292 cmp r2, #12
293 blt 34f
294 PLD( pld [r1, #-16] )
295 sub r2, r2, #12
296 PLD( subs r2, r2, #32 )
297 PLD( blt 33f )
298 PLD( pld [r1, #-32] )
29932: PLD( pld [r1, #-48] )
30033: mov r7, r3, push #8
301 ldmdb r1!, {r3, r4, r5, r6}
302 subs r2, r2, #16
303 orr r7, r7, r6, pull #24
304 mov r6, r6, push #8
305 orr r6, r6, r5, pull #24
306 mov r5, r5, push #8
307 orr r5, r5, r4, pull #24
308 mov r4, r4, push #8
309 orr r4, r4, r3, pull #24
310 stmdb r0!, {r4, r5, r6, r7}
311 bge 32b
312 PLD( cmn r2, #32 )
313 PLD( bge 33b )
314 PLD( add r2, r2, #32 )
315 adds r2, r2, #12
316 blt 35f
31734: mov ip, r3, push #8
318 ldr r3, [r1, #-4]!
319 subs r2, r2, #4
320 orr ip, ip, r3, pull #24
321 str ip, [r0, #-4]!
322 bge 34b
32335: add r1, r1, #3
324 b 29b
325
32636: cmp r2, #12
327 blt 39f
328 PLD( pld [r1, #-16] )
329 sub r2, r2, #12
330 PLD( subs r2, r2, #32 )
331 PLD( blt 38f )
332 PLD( pld [r1, #-32] )
33337: PLD( pld [r1, #-48] )
33438: mov r7, r3, push #16
335 ldmdb r1!, {r3, r4, r5, r6}
336 subs r2, r2, #16
337 orr r7, r7, r6, pull #16
338 mov r6, r6, push #16
339 orr r6, r6, r5, pull #16
340 mov r5, r5, push #16
341 orr r5, r5, r4, pull #16
342 mov r4, r4, push #16
343 orr r4, r4, r3, pull #16
344 stmdb r0!, {r4, r5, r6, r7}
345 bge 37b
346 PLD( cmn r2, #32 )
347 PLD( bge 38b )
348 PLD( add r2, r2, #32 )
349 adds r2, r2, #12
350 blt 40f
35139: mov ip, r3, push #16
352 ldr r3, [r1, #-4]!
353 subs r2, r2, #4
354 orr ip, ip, r3, pull #16
355 str ip, [r0, #-4]!
356 bge 39b
35740: add r1, r1, #2
358 b 29b
359 57
36041: cmp r2, #12 58#include "copy_template.S"
361 blt 44f
362 PLD( pld [r1, #-16] )
363 sub r2, r2, #12
364 PLD( subs r2, r2, #32 )
365 PLD( blt 43f )
366 PLD( pld [r1, #-32] )
36742: PLD( pld [r1, #-48] )
36843: mov r7, r3, push #24
369 ldmdb r1!, {r3, r4, r5, r6}
370 subs r2, r2, #16
371 orr r7, r7, r6, pull #8
372 mov r6, r6, push #24
373 orr r6, r6, r5, pull #8
374 mov r5, r5, push #24
375 orr r5, r5, r4, pull #8
376 mov r4, r4, push #24
377 orr r4, r4, r3, pull #8
378 stmdb r0!, {r4, r5, r6, r7}
379 bge 42b
380 PLD( cmn r2, #32 )
381 PLD( bge 43b )
382 PLD( add r2, r2, #32 )
383 adds r2, r2, #12
384 blt 45f
38544: mov ip, r3, push #24
386 ldr r3, [r1, #-4]!
387 subs r2, r2, #4
388 orr ip, ip, r3, pull #8
389 str ip, [r0, #-4]!
390 bge 44b
39145: add r1, r1, #1
392 b 29b
393 59