aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNicolas Pitre <nico@cam.org>2005-11-01 14:52:23 -0500
committerRussell King <rmk+kernel@arm.linux.org.uk>2005-11-01 14:52:23 -0500
commit7549423000fc38d39a8b81c601dea0332c113a42 (patch)
tree6b76fe2867b9634a1d1dbaf682c69ccad4e9f71b
parenta0c6fdb987860e6c7f9b8e57439ca2703f462578 (diff)
[ARM] 2947/1: copy template with new memcpy/memmove
Patch from Nicolas Pitre This patch provides a new implementation for optimized memory copy functions on ARM. It is made of two levels: a template that consists of the core copy code and separate files that define macros to be used with the core code depending on the type of copy needed. This allows for best performances while sharing the same core for implementing memcpy(), copy_from_user() and copy_to_user() for instance. Two reasons for this work: 1) the current copy_to_user/copy_from_user implementation assumes no task switch will ever occur in the middle of each copied page making it completely unsafe with CONFIG_PREEMPT=y. 2) current copy implementations are measurably suboptimal and optimizing different implementations separately is a pain and more opportunities for bugs. The reason for (1) is the fact that copy inside user pages are performed with the ldm instruction which has no mean for testing user protections and could possibly race with process preemption bypassing the COW mechanism for example. This is a longstanding issue that we said ought to be fixed for about two years now. The solution is to substitute those ldm insns with a series of ldrt or strt insns to enforce user memory protection. At least on StrongARM and XScale cores the ldm is not faster than the equivalent ldr/str insns with a warm i-cache so there is no measurable performance degradation with that change. The fact that the copy code is a template makes it pretty easy to reuse the same core code as for memcpy and benefit from the same performance optimizations. Now (2) is best demonstrated with actual throughput measurements. First, here is a summary of memcopy tests performed on a StrongARM core: PTR alignment buffer size kernel version this version ------------------------------------------------------------ aligned 32 59.73 107.43 unaligned 32 61.31 74.72 aligned 100 132.47 136.15 unaligned 100 103.84 123.76 aligned 4096 130.67 130.80 unaligned 4096 130.68 130.64 aligned 1048576 68.03 68.18 unaligned 1048576 68.03 68.18 The buffer size is in bytes and the measured speed in MB/s. The copy was performed repeatedly with given buffer and throughput averaged over 3 seconds. Here we can see that the current kernel version has a higher entry cost that shows up with small buffers. As buffer size grows both implementation converge to the same throughput. Now here's the exact same test performed on an XScale core (PXA255): PTR alignment buffer size kernel version this version ------------------------------------------------------------ aligned 32 46.99 77.58 unaligned 32 53.61 59.59 aligned 100 107.19 136.59 unaligned 100 83.61 97.58 aligned 4096 129.13 129.98 unaligned 4096 128.36 128.53 aligned 1048576 53.76 59.41 unaligned 1048576 33.67 56.96 Again we can see the entry setup cost being higher for the current kernel before getting to the main copy loop. Then throughput results converge as long as the buffer remains in the cache. Then the 1MB case shows more differences probably due to better pld placement and/or less instruction interlocks in this proposed implementation. Disclaimer: The PXA system was running with slower clocks than the StrongARM system so trying to infer any conclusion by comparing those separate sets of results side by side would be completely inappropriate. So... What this patch does is to replace both memcpy and memmove with an implementation based on the provided copy code template. The memmove code is kept separate since it is used only if the memory areas involved do overlap in which case the code is a transposition of the template but with the copy occurring in the opposite direction (trying to fit that mode into the template turned it into a mess not worth it for memmove alone). And obviously both memcpy and memmove were tested with all kinds of pointer alignments and buffer sizes to exercise all code paths for correctness. The next patch will provide the now trivial replacement implementation copy_to_user and copy_from_user. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
-rw-r--r--arch/arm/lib/Makefile5
-rw-r--r--arch/arm/lib/copy_template.S255
-rw-r--r--arch/arm/lib/memcpy.S410
-rw-r--r--arch/arm/lib/memmove.S206
4 files changed, 502 insertions, 374 deletions
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index d3d9b21eb7e4..8f9770f1af19 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -7,8 +7,9 @@
7lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \ 7lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
8 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \ 8 csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
9 copy_page.o delay.o findbit.o memchr.o memcpy.o \ 9 copy_page.o delay.o findbit.o memchr.o memcpy.o \
10 memset.o memzero.o setbit.o strncpy_from_user.o \ 10 memmove.o memset.o memzero.o setbit.o \
11 strnlen_user.o strchr.o strrchr.o testchangebit.o \ 11 strncpy_from_user.o strnlen_user.o \
12 strchr.o strrchr.o testchangebit.o \
12 testclearbit.o testsetbit.o uaccess.o \ 13 testclearbit.o testsetbit.o uaccess.o \
13 getuser.o putuser.o clear_user.o \ 14 getuser.o putuser.o clear_user.o \
14 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \ 15 ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
new file mode 100644
index 000000000000..838e435e4922
--- /dev/null
+++ b/arch/arm/lib/copy_template.S
@@ -0,0 +1,255 @@
1/*
2 * linux/arch/arm/lib/copy_template.s
3 *
4 * Code template for optimized memory copy functions
5 *
6 * Author: Nicolas Pitre
7 * Created: Sep 28, 2005
8 * Copyright: MontaVista Software, Inc.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15/*
16 * This can be used to enable code to cacheline align the source pointer.
17 * Experiments on tested architectures (StrongARM and XScale) didn't show
18 * this a worthwhile thing to do. That might be different in the future.
19 */
20//#define CALGN(code...) code
21#define CALGN(code...)
22
23/*
24 * Theory of operation
25 * -------------------
26 *
27 * This file provides the core code for a forward memory copy used in
28 * the implementation of memcopy(), copy_to_user() and copy_from_user().
29 *
30 * The including file must define the following accessor macros
31 * according to the need of the given function:
32 *
33 * ldr1w ptr reg abort
34 *
35 * This loads one word from 'ptr', stores it in 'reg' and increments
36 * 'ptr' to the next word. The 'abort' argument is used for fixup tables.
37 *
38 * ldr4w ptr reg1 reg2 reg3 reg4 abort
39 * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
40 *
41 * This loads four or eight words starting from 'ptr', stores them
42 * in provided registers and increments 'ptr' past those words.
43 * The'abort' argument is used for fixup tables.
44 *
45 * ldr1b ptr reg cond abort
46 *
47 * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
48 * It also must apply the condition code if provided, otherwise the
49 * "al" condition is assumed by default.
50 *
51 * str1w ptr reg abort
52 * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
53 * str1b ptr reg cond abort
54 *
55 * Same as their ldr* counterparts, but data is stored to 'ptr' location
56 * rather than being loaded.
57 *
58 * enter reg1 reg2
59 *
60 * Preserve the provided registers on the stack plus any additional
61 * data as needed by the implementation including this code. Called
62 * upon code entry.
63 *
64 * exit reg1 reg2
65 *
66 * Restore registers with the values previously saved with the
67 * 'preserv' macro. Called upon code termination.
68 */
69
70
71 enter r4, lr
72
73 subs r2, r2, #4
74 blt 8f
75 ands ip, r0, #3
76 PLD( pld [r1, #0] )
77 bne 9f
78 ands ip, r1, #3
79 bne 10f
80
811: subs r2, r2, #(28)
82 stmfd sp!, {r5 - r8}
83 blt 5f
84
85 CALGN( ands ip, r1, #31 )
86 CALGN( rsb r3, ip, #32 )
87 CALGN( sbcnes r4, r3, r2 ) @ C is always set here
88 CALGN( bcs 2f )
89 CALGN( adr r4, 6f )
90 CALGN( subs r2, r2, r3 ) @ C gets set
91 CALGN( add pc, r4, ip )
92
93 PLD( pld [r1, #0] )
942: PLD( subs r2, r2, #96 )
95 PLD( pld [r1, #28] )
96 PLD( blt 4f )
97 PLD( pld [r1, #60] )
98 PLD( pld [r1, #92] )
99
1003: PLD( pld [r1, #124] )
1014: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
102 subs r2, r2, #32
103 str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
104 bge 3b
105 PLD( cmn r2, #96 )
106 PLD( bge 4b )
107
1085: ands ip, r2, #28
109 rsb ip, ip, #32
110 addne pc, pc, ip @ C is always clear here
111 b 7f
1126: nop
113 ldr1w r1, r3, abort=20f
114 ldr1w r1, r4, abort=20f
115 ldr1w r1, r5, abort=20f
116 ldr1w r1, r6, abort=20f
117 ldr1w r1, r7, abort=20f
118 ldr1w r1, r8, abort=20f
119 ldr1w r1, lr, abort=20f
120
121 add pc, pc, ip
122 nop
123 nop
124 str1w r0, r3, abort=20f
125 str1w r0, r4, abort=20f
126 str1w r0, r5, abort=20f
127 str1w r0, r6, abort=20f
128 str1w r0, r7, abort=20f
129 str1w r0, r8, abort=20f
130 str1w r0, lr, abort=20f
131
132 CALGN( bcs 2b )
133
1347: ldmfd sp!, {r5 - r8}
135
1368: movs r2, r2, lsl #31
137 ldr1b r1, r3, ne, abort=21f
138 ldr1b r1, r4, cs, abort=21f
139 ldr1b r1, ip, cs, abort=21f
140 str1b r0, r3, ne, abort=21f
141 str1b r0, r4, cs, abort=21f
142 str1b r0, ip, cs, abort=21f
143
144 exit r4, pc
145
1469: rsb ip, ip, #4
147 cmp ip, #2
148 ldr1b r1, r3, gt, abort=21f
149 ldr1b r1, r4, ge, abort=21f
150 ldr1b r1, lr, abort=21f
151 str1b r0, r3, gt, abort=21f
152 str1b r0, r4, ge, abort=21f
153 subs r2, r2, ip
154 str1b r0, lr, abort=21f
155 blt 8b
156 ands ip, r1, #3
157 beq 1b
158
15910: bic r1, r1, #3
160 cmp ip, #2
161 ldr1w r1, lr, abort=21f
162 beq 17f
163 bgt 18f
164
165
166 .macro forward_copy_shift pull push
167
168 subs r2, r2, #28
169 blt 14f
170
171 CALGN( ands ip, r1, #31 )
172 CALGN( rsb ip, ip, #32 )
173 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
174 CALGN( subcc r2, r2, ip )
175 CALGN( bcc 15f )
176
17711: stmfd sp!, {r5 - r9}
178
179 PLD( pld [r1, #0] )
180 PLD( subs r2, r2, #96 )
181 PLD( pld [r1, #28] )
182 PLD( blt 13f )
183 PLD( pld [r1, #60] )
184 PLD( pld [r1, #92] )
185
18612: PLD( pld [r1, #124] )
18713: ldr4w r1, r4, r5, r6, r7, abort=19f
188 mov r3, lr, pull #\pull
189 subs r2, r2, #32
190 ldr4w r1, r8, r9, ip, lr, abort=19f
191 orr r3, r3, r4, push #\push
192 mov r4, r4, pull #\pull
193 orr r4, r4, r5, push #\push
194 mov r5, r5, pull #\pull
195 orr r5, r5, r6, push #\push
196 mov r6, r6, pull #\pull
197 orr r6, r6, r7, push #\push
198 mov r7, r7, pull #\pull
199 orr r7, r7, r8, push #\push
200 mov r8, r8, pull #\pull
201 orr r8, r8, r9, push #\push
202 mov r9, r9, pull #\pull
203 orr r9, r9, ip, push #\push
204 mov ip, ip, pull #\pull
205 orr ip, ip, lr, push #\push
206 str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
207 bge 12b
208 PLD( cmn r2, #96 )
209 PLD( bge 13b )
210
211 ldmfd sp!, {r5 - r9}
212
21314: ands ip, r2, #28
214 beq 16f
215
21615: mov r3, lr, pull #\pull
217 ldr1w r1, lr, abort=21f
218 subs ip, ip, #4
219 orr r3, r3, lr, push #\push
220 str1w r0, r3, abort=21f
221 bgt 15b
222 CALGN( cmp r2, #0 )
223 CALGN( bge 11b )
224
22516: sub r1, r1, #(\push / 8)
226 b 8b
227
228 .endm
229
230
231 forward_copy_shift pull=8 push=24
232
23317: forward_copy_shift pull=16 push=16
234
23518: forward_copy_shift pull=24 push=8
236
237
238/*
239 * Abort preanble and completion macros.
240 * If a fixup handler is required then those macros must surround it.
241 * It is assumed that the fixup code will handle the private part of
242 * the exit macro.
243 */
244
245 .macro copy_abort_preamble
24619: ldmfd sp!, {r5 - r9}
247 b 21f
24820: ldmfd sp!, {r5 - r8}
24921:
250 .endm
251
252 .macro copy_abort_end
253 ldmfd sp!, {r4, pc}
254 .endm
255
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index f5a593ceb8cc..7e71d6708a8d 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -1,393 +1,59 @@
1/* 1/*
2 * linux/arch/arm/lib/memcpy.S 2 * linux/arch/arm/lib/memcpy.S
3 * 3 *
4 * Copyright (C) 1995-1999 Russell King 4 * Author: Nicolas Pitre
5 * Created: Sep 28, 2005
6 * Copyright: MontaVista Software, Inc.
5 * 7 *
6 * This program is free software; you can redistribute it and/or modify 8 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
9 *
10 * ASM optimised string functions
11 */ 11 */
12
12#include <linux/linkage.h> 13#include <linux/linkage.h>
13#include <asm/assembler.h> 14#include <asm/assembler.h>
14 15
15 .text 16 .macro ldr1w ptr reg abort
16 17 ldr \reg, [\ptr], #4
17#define ENTER \ 18 .endm
18 mov ip,sp ;\
19 stmfd sp!,{r0,r4-r9,fp,ip,lr,pc} ;\
20 sub fp,ip,#4
21
22#define EXIT \
23 LOADREGS(ea, fp, {r0, r4 - r9, fp, sp, pc})
24
25#define EXITEQ \
26 LOADREGS(eqea, fp, {r0, r4 - r9, fp, sp, pc})
27
28/*
29 * Prototype: void memcpy(void *to,const void *from,unsigned long n);
30 */
31ENTRY(memcpy)
32ENTRY(memmove)
33 ENTER
34 cmp r1, r0
35 bcc 23f
36 subs r2, r2, #4
37 blt 6f
38 PLD( pld [r1, #0] )
39 ands ip, r0, #3
40 bne 7f
41 ands ip, r1, #3
42 bne 8f
43 19
441: subs r2, r2, #8 20 .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
45 blt 5f 21 ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
46 subs r2, r2, #20 22 .endm
47 blt 4f
48 PLD( pld [r1, #28] )
49 PLD( subs r2, r2, #64 )
50 PLD( blt 3f )
512: PLD( pld [r1, #60] )
52 PLD( pld [r1, #92] )
53 ldmia r1!, {r3 - r9, ip}
54 subs r2, r2, #32
55 stmgeia r0!, {r3 - r9, ip}
56 ldmgeia r1!, {r3 - r9, ip}
57 subges r2, r2, #32
58 stmia r0!, {r3 - r9, ip}
59 bge 2b
603: PLD( ldmia r1!, {r3 - r9, ip} )
61 PLD( adds r2, r2, #32 )
62 PLD( stmgeia r0!, {r3 - r9, ip} )
63 PLD( ldmgeia r1!, {r3 - r9, ip} )
64 PLD( subges r2, r2, #32 )
65 PLD( stmia r0!, {r3 - r9, ip} )
664: cmn r2, #16
67 ldmgeia r1!, {r3 - r6}
68 subge r2, r2, #16
69 stmgeia r0!, {r3 - r6}
70 adds r2, r2, #20
71 ldmgeia r1!, {r3 - r5}
72 subge r2, r2, #12
73 stmgeia r0!, {r3 - r5}
745: adds r2, r2, #8
75 blt 6f
76 subs r2, r2, #4
77 ldrlt r3, [r1], #4
78 ldmgeia r1!, {r4, r5}
79 subge r2, r2, #4
80 strlt r3, [r0], #4
81 stmgeia r0!, {r4, r5}
82 23
836: adds r2, r2, #4 24 .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
84 EXITEQ 25 ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
85 cmp r2, #2 26 .endm
86 ldrb r3, [r1], #1
87 ldrgeb r4, [r1], #1
88 ldrgtb r5, [r1], #1
89 strb r3, [r0], #1
90 strgeb r4, [r0], #1
91 strgtb r5, [r0], #1
92 EXIT
93 27
947: rsb ip, ip, #4 28 .macro ldr1b ptr reg cond=al abort
95 cmp ip, #2 29 ldr\cond\()b \reg, [\ptr], #1
96 ldrb r3, [r1], #1 30 .endm
97 ldrgeb r4, [r1], #1
98 ldrgtb r5, [r1], #1
99 strb r3, [r0], #1
100 strgeb r4, [r0], #1
101 strgtb r5, [r0], #1
102 subs r2, r2, ip
103 blt 6b
104 ands ip, r1, #3
105 beq 1b
106 31
1078: bic r1, r1, #3 32 .macro str1w ptr reg abort
108 ldr r7, [r1], #4 33 str \reg, [\ptr], #4
109 cmp ip, #2 34 .endm
110 bgt 18f
111 beq 13f
112 cmp r2, #12
113 blt 11f
114 PLD( pld [r1, #12] )
115 sub r2, r2, #12
116 PLD( subs r2, r2, #32 )
117 PLD( blt 10f )
118 PLD( pld [r1, #28] )
1199: PLD( pld [r1, #44] )
12010: mov r3, r7, pull #8
121 ldmia r1!, {r4 - r7}
122 subs r2, r2, #16
123 orr r3, r3, r4, push #24
124 mov r4, r4, pull #8
125 orr r4, r4, r5, push #24
126 mov r5, r5, pull #8
127 orr r5, r5, r6, push #24
128 mov r6, r6, pull #8
129 orr r6, r6, r7, push #24
130 stmia r0!, {r3 - r6}
131 bge 9b
132 PLD( cmn r2, #32 )
133 PLD( bge 10b )
134 PLD( add r2, r2, #32 )
135 adds r2, r2, #12
136 blt 12f
13711: mov r3, r7, pull #8
138 ldr r7, [r1], #4
139 subs r2, r2, #4
140 orr r3, r3, r7, push #24
141 str r3, [r0], #4
142 bge 11b
14312: sub r1, r1, #3
144 b 6b
145 35
14613: cmp r2, #12 36 .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
147 blt 16f 37 stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
148 PLD( pld [r1, #12] ) 38 .endm
149 sub r2, r2, #12
150 PLD( subs r2, r2, #32 )
151 PLD( blt 15f )
152 PLD( pld [r1, #28] )
15314: PLD( pld [r1, #44] )
15415: mov r3, r7, pull #16
155 ldmia r1!, {r4 - r7}
156 subs r2, r2, #16
157 orr r3, r3, r4, push #16
158 mov r4, r4, pull #16
159 orr r4, r4, r5, push #16
160 mov r5, r5, pull #16
161 orr r5, r5, r6, push #16
162 mov r6, r6, pull #16
163 orr r6, r6, r7, push #16
164 stmia r0!, {r3 - r6}
165 bge 14b
166 PLD( cmn r2, #32 )
167 PLD( bge 15b )
168 PLD( add r2, r2, #32 )
169 adds r2, r2, #12
170 blt 17f
17116: mov r3, r7, pull #16
172 ldr r7, [r1], #4
173 subs r2, r2, #4
174 orr r3, r3, r7, push #16
175 str r3, [r0], #4
176 bge 16b
17717: sub r1, r1, #2
178 b 6b
179 39
18018: cmp r2, #12 40 .macro str1b ptr reg cond=al abort
181 blt 21f 41 str\cond\()b \reg, [\ptr], #1
182 PLD( pld [r1, #12] ) 42 .endm
183 sub r2, r2, #12
184 PLD( subs r2, r2, #32 )
185 PLD( blt 20f )
186 PLD( pld [r1, #28] )
18719: PLD( pld [r1, #44] )
18820: mov r3, r7, pull #24
189 ldmia r1!, {r4 - r7}
190 subs r2, r2, #16
191 orr r3, r3, r4, push #8
192 mov r4, r4, pull #24
193 orr r4, r4, r5, push #8
194 mov r5, r5, pull #24
195 orr r5, r5, r6, push #8
196 mov r6, r6, pull #24
197 orr r6, r6, r7, push #8
198 stmia r0!, {r3 - r6}
199 bge 19b
200 PLD( cmn r2, #32 )
201 PLD( bge 20b )
202 PLD( add r2, r2, #32 )
203 adds r2, r2, #12
204 blt 22f
20521: mov r3, r7, pull #24
206 ldr r7, [r1], #4
207 subs r2, r2, #4
208 orr r3, r3, r7, push #8
209 str r3, [r0], #4
210 bge 21b
21122: sub r1, r1, #1
212 b 6b
213 43
44 .macro enter reg1 reg2
45 stmdb sp!, {r0, \reg1, \reg2}
46 .endm
214 47
21523: add r1, r1, r2 48 .macro exit reg1 reg2
216 add r0, r0, r2 49 ldmfd sp!, {r0, \reg1, \reg2}
217 subs r2, r2, #4 50 .endm
218 blt 29f
219 PLD( pld [r1, #-4] )
220 ands ip, r0, #3
221 bne 30f
222 ands ip, r1, #3
223 bne 31f
224 51
22524: subs r2, r2, #8 52 .text
226 blt 28f
227 subs r2, r2, #20
228 blt 27f
229 PLD( pld [r1, #-32] )
230 PLD( subs r2, r2, #64 )
231 PLD( blt 26f )
23225: PLD( pld [r1, #-64] )
233 PLD( pld [r1, #-96] )
234 ldmdb r1!, {r3 - r9, ip}
235 subs r2, r2, #32
236 stmgedb r0!, {r3 - r9, ip}
237 ldmgedb r1!, {r3 - r9, ip}
238 subges r2, r2, #32
239 stmdb r0!, {r3 - r9, ip}
240 bge 25b
24126: PLD( ldmdb r1!, {r3 - r9, ip} )
242 PLD( adds r2, r2, #32 )
243 PLD( stmgedb r0!, {r3 - r9, ip} )
244 PLD( ldmgedb r1!, {r3 - r9, ip} )
245 PLD( subges r2, r2, #32 )
246 PLD( stmdb r0!, {r3 - r9, ip} )
24727: cmn r2, #16
248 ldmgedb r1!, {r3 - r6}
249 subge r2, r2, #16
250 stmgedb r0!, {r3 - r6}
251 adds r2, r2, #20
252 ldmgedb r1!, {r3 - r5}
253 subge r2, r2, #12
254 stmgedb r0!, {r3 - r5}
25528: adds r2, r2, #8
256 blt 29f
257 subs r2, r2, #4
258 ldrlt r3, [r1, #-4]!
259 ldmgedb r1!, {r4, r5}
260 subge r2, r2, #4
261 strlt r3, [r0, #-4]!
262 stmgedb r0!, {r4, r5}
263 53
26429: adds r2, r2, #4 54/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
265 EXITEQ
266 cmp r2, #2
267 ldrb r3, [r1, #-1]!
268 ldrgeb r4, [r1, #-1]!
269 ldrgtb r5, [r1, #-1]!
270 strb r3, [r0, #-1]!
271 strgeb r4, [r0, #-1]!
272 strgtb r5, [r0, #-1]!
273 EXIT
274 55
27530: cmp ip, #2 56ENTRY(memcpy)
276 ldrb r3, [r1, #-1]!
277 ldrgeb r4, [r1, #-1]!
278 ldrgtb r5, [r1, #-1]!
279 strb r3, [r0, #-1]!
280 strgeb r4, [r0, #-1]!
281 strgtb r5, [r0, #-1]!
282 subs r2, r2, ip
283 blt 29b
284 ands ip, r1, #3
285 beq 24b
286
28731: bic r1, r1, #3
288 ldr r3, [r1], #0
289 cmp ip, #2
290 blt 41f
291 beq 36f
292 cmp r2, #12
293 blt 34f
294 PLD( pld [r1, #-16] )
295 sub r2, r2, #12
296 PLD( subs r2, r2, #32 )
297 PLD( blt 33f )
298 PLD( pld [r1, #-32] )
29932: PLD( pld [r1, #-48] )
30033: mov r7, r3, push #8
301 ldmdb r1!, {r3, r4, r5, r6}
302 subs r2, r2, #16
303 orr r7, r7, r6, pull #24
304 mov r6, r6, push #8
305 orr r6, r6, r5, pull #24
306 mov r5, r5, push #8
307 orr r5, r5, r4, pull #24
308 mov r4, r4, push #8
309 orr r4, r4, r3, pull #24
310 stmdb r0!, {r4, r5, r6, r7}
311 bge 32b
312 PLD( cmn r2, #32 )
313 PLD( bge 33b )
314 PLD( add r2, r2, #32 )
315 adds r2, r2, #12
316 blt 35f
31734: mov ip, r3, push #8
318 ldr r3, [r1, #-4]!
319 subs r2, r2, #4
320 orr ip, ip, r3, pull #24
321 str ip, [r0, #-4]!
322 bge 34b
32335: add r1, r1, #3
324 b 29b
325
32636: cmp r2, #12
327 blt 39f
328 PLD( pld [r1, #-16] )
329 sub r2, r2, #12
330 PLD( subs r2, r2, #32 )
331 PLD( blt 38f )
332 PLD( pld [r1, #-32] )
33337: PLD( pld [r1, #-48] )
33438: mov r7, r3, push #16
335 ldmdb r1!, {r3, r4, r5, r6}
336 subs r2, r2, #16
337 orr r7, r7, r6, pull #16
338 mov r6, r6, push #16
339 orr r6, r6, r5, pull #16
340 mov r5, r5, push #16
341 orr r5, r5, r4, pull #16
342 mov r4, r4, push #16
343 orr r4, r4, r3, pull #16
344 stmdb r0!, {r4, r5, r6, r7}
345 bge 37b
346 PLD( cmn r2, #32 )
347 PLD( bge 38b )
348 PLD( add r2, r2, #32 )
349 adds r2, r2, #12
350 blt 40f
35139: mov ip, r3, push #16
352 ldr r3, [r1, #-4]!
353 subs r2, r2, #4
354 orr ip, ip, r3, pull #16
355 str ip, [r0, #-4]!
356 bge 39b
35740: add r1, r1, #2
358 b 29b
359 57
36041: cmp r2, #12 58#include "copy_template.S"
361 blt 44f
362 PLD( pld [r1, #-16] )
363 sub r2, r2, #12
364 PLD( subs r2, r2, #32 )
365 PLD( blt 43f )
366 PLD( pld [r1, #-32] )
36742: PLD( pld [r1, #-48] )
36843: mov r7, r3, push #24
369 ldmdb r1!, {r3, r4, r5, r6}
370 subs r2, r2, #16
371 orr r7, r7, r6, pull #8
372 mov r6, r6, push #24
373 orr r6, r6, r5, pull #8
374 mov r5, r5, push #24
375 orr r5, r5, r4, pull #8
376 mov r4, r4, push #24
377 orr r4, r4, r3, pull #8
378 stmdb r0!, {r4, r5, r6, r7}
379 bge 42b
380 PLD( cmn r2, #32 )
381 PLD( bge 43b )
382 PLD( add r2, r2, #32 )
383 adds r2, r2, #12
384 blt 45f
38544: mov ip, r3, push #24
386 ldr r3, [r1, #-4]!
387 subs r2, r2, #4
388 orr ip, ip, r3, pull #8
389 str ip, [r0, #-4]!
390 bge 44b
39145: add r1, r1, #1
392 b 29b
393 59
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
new file mode 100644
index 000000000000..ef7fddc14ac9
--- /dev/null
+++ b/arch/arm/lib/memmove.S
@@ -0,0 +1,206 @@
1/*
2 * linux/arch/arm/lib/memmove.S
3 *
4 * Author: Nicolas Pitre
5 * Created: Sep 28, 2005
6 * Copyright: (C) MontaVista Software Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 */
12
13#include <linux/linkage.h>
14#include <asm/assembler.h>
15
16/*
17 * This can be used to enable code to cacheline align the source pointer.
18 * Experiments on tested architectures (StrongARM and XScale) didn't show
19 * this a worthwhile thing to do. That might be different in the future.
20 */
21//#define CALGN(code...) code
22#define CALGN(code...)
23
24 .text
25
26/*
27 * Prototype: void *memmove(void *dest, const void *src, size_t n);
28 *
29 * Note:
30 *
31 * If the memory regions don't overlap, we simply branch to memcpy which is
32 * normally a bit faster. Otherwise the copy is done going downwards. This
33 * is a transposition of the code from copy_template.S but with the copy
34 * occurring in the opposite direction.
35 */
36
37ENTRY(memmove)
38
39 subs ip, r0, r1
40 cmphi r2, ip
41 bls memcpy
42
43 stmfd sp!, {r0, r4, lr}
44 add r1, r1, r2
45 add r0, r0, r2
46 subs r2, r2, #4
47 blt 8f
48 ands ip, r0, #3
49 PLD( pld [r1, #-4] )
50 bne 9f
51 ands ip, r1, #3
52 bne 10f
53
541: subs r2, r2, #(28)
55 stmfd sp!, {r5 - r8}
56 blt 5f
57
58 CALGN( ands ip, r1, #31 )
59 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
60 CALGN( bcs 2f )
61 CALGN( adr r4, 6f )
62 CALGN( subs r2, r2, ip ) @ C is set here
63 CALGN( add pc, r4, ip )
64
65 PLD( pld [r1, #-4] )
662: PLD( subs r2, r2, #96 )
67 PLD( pld [r1, #-32] )
68 PLD( blt 4f )
69 PLD( pld [r1, #-64] )
70 PLD( pld [r1, #-96] )
71
723: PLD( pld [r1, #-128] )
734: ldmdb r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
74 subs r2, r2, #32
75 stmdb r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
76 bge 3b
77 PLD( cmn r2, #96 )
78 PLD( bge 4b )
79
805: ands ip, r2, #28
81 rsb ip, ip, #32
82 addne pc, pc, ip @ C is always clear here
83 b 7f
846: nop
85 ldr r3, [r1, #-4]!
86 ldr r4, [r1, #-4]!
87 ldr r5, [r1, #-4]!
88 ldr r6, [r1, #-4]!
89 ldr r7, [r1, #-4]!
90 ldr r8, [r1, #-4]!
91 ldr lr, [r1, #-4]!
92
93 add pc, pc, ip
94 nop
95 nop
96 str r3, [r0, #-4]!
97 str r4, [r0, #-4]!
98 str r5, [r0, #-4]!
99 str r6, [r0, #-4]!
100 str r7, [r0, #-4]!
101 str r8, [r0, #-4]!
102 str lr, [r0, #-4]!
103
104 CALGN( bcs 2b )
105
1067: ldmfd sp!, {r5 - r8}
107
1088: movs r2, r2, lsl #31
109 ldrneb r3, [r1, #-1]!
110 ldrcsb r4, [r1, #-1]!
111 ldrcsb ip, [r1, #-1]
112 strneb r3, [r0, #-1]!
113 strcsb r4, [r0, #-1]!
114 strcsb ip, [r0, #-1]
115 ldmfd sp!, {r0, r4, pc}
116
1179: cmp ip, #2
118 ldrgtb r3, [r1, #-1]!
119 ldrgeb r4, [r1, #-1]!
120 ldrb lr, [r1, #-1]!
121 strgtb r3, [r0, #-1]!
122 strgeb r4, [r0, #-1]!
123 subs r2, r2, ip
124 strb lr, [r0, #-1]!
125 blt 8b
126 ands ip, r1, #3
127 beq 1b
128
12910: bic r1, r1, #3
130 cmp ip, #2
131 ldr r3, [r1, #0]
132 beq 17f
133 blt 18f
134
135
136 .macro backward_copy_shift push pull
137
138 subs r2, r2, #28
139 blt 14f
140
141 CALGN( ands ip, r1, #31 )
142 CALGN( rsb ip, ip, #32 )
143 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
144 CALGN( subcc r2, r2, ip )
145 CALGN( bcc 15f )
146
14711: stmfd sp!, {r5 - r9}
148
149 PLD( pld [r1, #-4] )
150 PLD( subs r2, r2, #96 )
151 PLD( pld [r1, #-32] )
152 PLD( blt 13f )
153 PLD( pld [r1, #-64] )
154 PLD( pld [r1, #-96] )
155
15612: PLD( pld [r1, #-128] )
15713: ldmdb r1!, {r7, r8, r9, ip}
158 mov lr, r3, push #\push
159 subs r2, r2, #32
160 ldmdb r1!, {r3, r4, r5, r6}
161 orr lr, lr, ip, pull #\pull
162 mov ip, ip, push #\push
163 orr ip, ip, r9, pull #\pull
164 mov r9, r9, push #\push
165 orr r9, r9, r8, pull #\pull
166 mov r8, r8, push #\push
167 orr r8, r8, r7, pull #\pull
168 mov r7, r7, push #\push
169 orr r7, r7, r6, pull #\pull
170 mov r6, r6, push #\push
171 orr r6, r6, r5, pull #\pull
172 mov r5, r5, push #\push
173 orr r5, r5, r4, pull #\pull
174 mov r4, r4, push #\push
175 orr r4, r4, r3, pull #\pull
176 stmdb r0!, {r4 - r9, ip, lr}
177 bge 12b
178 PLD( cmn r2, #96 )
179 PLD( bge 13b )
180
181 ldmfd sp!, {r5 - r9}
182
18314: ands ip, r2, #28
184 beq 16f
185
18615: mov lr, r3, push #\push
187 ldr r3, [r1, #-4]!
188 subs ip, ip, #4
189 orr lr, lr, r3, pull #\pull
190 str lr, [r0, #-4]!
191 bgt 15b
192 CALGN( cmp r2, #0 )
193 CALGN( bge 11b )
194
19516: add r1, r1, #(\pull / 8)
196 b 8b
197
198 .endm
199
200
201 backward_copy_shift push=8 pull=24
202
20317: backward_copy_shift push=16 pull=16
204
20518: backward_copy_shift push=24 pull=8
206