aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm/lib/copy_template.S
diff options
context:
space:
mode:
authorNicolas Pitre <nico@cam.org>2005-11-01 14:52:23 -0500
committerRussell King <rmk+kernel@arm.linux.org.uk>2005-11-01 14:52:23 -0500
commit7549423000fc38d39a8b81c601dea0332c113a42 (patch)
tree6b76fe2867b9634a1d1dbaf682c69ccad4e9f71b /arch/arm/lib/copy_template.S
parenta0c6fdb987860e6c7f9b8e57439ca2703f462578 (diff)
[ARM] 2947/1: copy template with new memcpy/memmove
Patch from Nicolas Pitre This patch provides a new implementation for optimized memory copy functions on ARM. It is made of two levels: a template that consists of the core copy code and separate files that define macros to be used with the core code depending on the type of copy needed. This allows for best performance while sharing the same core for implementing memcpy(), copy_from_user() and copy_to_user() for instance. Two reasons for this work: 1) the current copy_to_user/copy_from_user implementation assumes no task switch will ever occur in the middle of each copied page making it completely unsafe with CONFIG_PREEMPT=y. 2) current copy implementations are measurably suboptimal and optimizing different implementations separately is a pain and creates more opportunities for bugs. The reason for (1) is the fact that copies inside user pages are performed with the ldm instruction which has no means of testing user protections and could possibly race with process preemption bypassing the COW mechanism for example. This is a longstanding issue that we said ought to be fixed for about two years now. The solution is to substitute those ldm insns with a series of ldrt or strt insns to enforce user memory protection. At least on StrongARM and XScale cores the ldm is not faster than the equivalent ldr/str insns with a warm i-cache so there is no measurable performance degradation with that change. The fact that the copy code is a template makes it pretty easy to reuse the same core code as for memcpy and benefit from the same performance optimizations. Now (2) is best demonstrated with actual throughput measurements. 
First, here is a summary of memcopy tests performed on a StrongARM core: PTR alignment buffer size kernel version this version ------------------------------------------------------------ aligned 32 59.73 107.43 unaligned 32 61.31 74.72 aligned 100 132.47 136.15 unaligned 100 103.84 123.76 aligned 4096 130.67 130.80 unaligned 4096 130.68 130.64 aligned 1048576 68.03 68.18 unaligned 1048576 68.03 68.18 The buffer size is in bytes and the measured speed in MB/s. The copy was performed repeatedly with given buffer and throughput averaged over 3 seconds. Here we can see that the current kernel version has a higher entry cost that shows up with small buffers. As buffer size grows both implementation converge to the same throughput. Now here's the exact same test performed on an XScale core (PXA255): PTR alignment buffer size kernel version this version ------------------------------------------------------------ aligned 32 46.99 77.58 unaligned 32 53.61 59.59 aligned 100 107.19 136.59 unaligned 100 83.61 97.58 aligned 4096 129.13 129.98 unaligned 4096 128.36 128.53 aligned 1048576 53.76 59.41 unaligned 1048576 33.67 56.96 Again we can see the entry setup cost being higher for the current kernel before getting to the main copy loop. Then throughput results converge as long as the buffer remains in the cache. Then the 1MB case shows more differences probably due to better pld placement and/or less instruction interlocks in this proposed implementation. Disclaimer: The PXA system was running with slower clocks than the StrongARM system so trying to infer any conclusion by comparing those separate sets of results side by side would be completely inappropriate. So... What this patch does is to replace both memcpy and memmove with an implementation based on the provided copy code template. 
The memmove code is kept separate since it is used only if the memory areas involved do overlap in which case the code is a transposition of the template but with the copy occurring in the opposite direction (trying to fit that mode into the template turned it into a mess not worth it for memmove alone). And obviously both memcpy and memmove were tested with all kinds of pointer alignments and buffer sizes to exercise all code paths for correctness. The next patch will provide the now trivial replacement implementation copy_to_user and copy_from_user. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
Diffstat (limited to 'arch/arm/lib/copy_template.S')
-rw-r--r--arch/arm/lib/copy_template.S255
1 files changed, 255 insertions, 0 deletions
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
new file mode 100644
index 000000000000..838e435e4922
--- /dev/null
+++ b/arch/arm/lib/copy_template.S
@@ -0,0 +1,255 @@
1/*
2 * linux/arch/arm/lib/copy_template.s
3 *
4 * Code template for optimized memory copy functions
5 *
6 * Author: Nicolas Pitre
7 * Created: Sep 28, 2005
8 * Copyright: MontaVista Software, Inc.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15/*
16 * This can be used to enable code to cacheline align the source pointer.
17 * Experiments on tested architectures (StrongARM and XScale) didn't show
 18 * this to be a worthwhile thing to do. That might be different in the future.
19 */
20//#define CALGN(code...) code
21#define CALGN(code...)
22
23/*
24 * Theory of operation
25 * -------------------
26 *
27 * This file provides the core code for a forward memory copy used in
 28 * the implementation of memcpy(), copy_to_user() and copy_from_user().
29 *
30 * The including file must define the following accessor macros
31 * according to the need of the given function:
32 *
33 * ldr1w ptr reg abort
34 *
35 * This loads one word from 'ptr', stores it in 'reg' and increments
36 * 'ptr' to the next word. The 'abort' argument is used for fixup tables.
37 *
38 * ldr4w ptr reg1 reg2 reg3 reg4 abort
 39 * ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
40 *
41 * This loads four or eight words starting from 'ptr', stores them
42 * in provided registers and increments 'ptr' past those words.
 43 * The 'abort' argument is used for fixup tables.
44 *
45 * ldr1b ptr reg cond abort
46 *
47 * Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
48 * It also must apply the condition code if provided, otherwise the
49 * "al" condition is assumed by default.
50 *
51 * str1w ptr reg abort
52 * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
53 * str1b ptr reg cond abort
54 *
55 * Same as their ldr* counterparts, but data is stored to 'ptr' location
56 * rather than being loaded.
57 *
58 * enter reg1 reg2
59 *
60 * Preserve the provided registers on the stack plus any additional
61 * data as needed by the implementation including this code. Called
62 * upon code entry.
63 *
64 * exit reg1 reg2
65 *
66 * Restore registers with the values previously saved with the
 67 * 'enter' macro. Called upon code termination.
68 */
69
70
71 enter r4, lr
72
73 subs r2, r2, #4
74 blt 8f
75 ands ip, r0, #3
76 PLD( pld [r1, #0] )
77 bne 9f
78 ands ip, r1, #3
79 bne 10f
80
811: subs r2, r2, #(28)
82 stmfd sp!, {r5 - r8}
83 blt 5f
84
85 CALGN( ands ip, r1, #31 )
86 CALGN( rsb r3, ip, #32 )
87 CALGN( sbcnes r4, r3, r2 ) @ C is always set here
88 CALGN( bcs 2f )
89 CALGN( adr r4, 6f )
90 CALGN( subs r2, r2, r3 ) @ C gets set
91 CALGN( add pc, r4, ip )
92
93 PLD( pld [r1, #0] )
942: PLD( subs r2, r2, #96 )
95 PLD( pld [r1, #28] )
96 PLD( blt 4f )
97 PLD( pld [r1, #60] )
98 PLD( pld [r1, #92] )
99
1003: PLD( pld [r1, #124] )
1014: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
102 subs r2, r2, #32
103 str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
104 bge 3b
105 PLD( cmn r2, #96 )
106 PLD( bge 4b )
107
1085: ands ip, r2, #28
109 rsb ip, ip, #32
110 addne pc, pc, ip @ C is always clear here
111 b 7f
1126: nop
113 ldr1w r1, r3, abort=20f
114 ldr1w r1, r4, abort=20f
115 ldr1w r1, r5, abort=20f
116 ldr1w r1, r6, abort=20f
117 ldr1w r1, r7, abort=20f
118 ldr1w r1, r8, abort=20f
119 ldr1w r1, lr, abort=20f
120
121 add pc, pc, ip
122 nop
123 nop
124 str1w r0, r3, abort=20f
125 str1w r0, r4, abort=20f
126 str1w r0, r5, abort=20f
127 str1w r0, r6, abort=20f
128 str1w r0, r7, abort=20f
129 str1w r0, r8, abort=20f
130 str1w r0, lr, abort=20f
131
132 CALGN( bcs 2b )
133
1347: ldmfd sp!, {r5 - r8}
135
1368: movs r2, r2, lsl #31
137 ldr1b r1, r3, ne, abort=21f
138 ldr1b r1, r4, cs, abort=21f
139 ldr1b r1, ip, cs, abort=21f
140 str1b r0, r3, ne, abort=21f
141 str1b r0, r4, cs, abort=21f
142 str1b r0, ip, cs, abort=21f
143
144 exit r4, pc
145
1469: rsb ip, ip, #4
147 cmp ip, #2
148 ldr1b r1, r3, gt, abort=21f
149 ldr1b r1, r4, ge, abort=21f
150 ldr1b r1, lr, abort=21f
151 str1b r0, r3, gt, abort=21f
152 str1b r0, r4, ge, abort=21f
153 subs r2, r2, ip
154 str1b r0, lr, abort=21f
155 blt 8b
156 ands ip, r1, #3
157 beq 1b
158
15910: bic r1, r1, #3
160 cmp ip, #2
161 ldr1w r1, lr, abort=21f
162 beq 17f
163 bgt 18f
164
165
166 .macro forward_copy_shift pull push
167
168 subs r2, r2, #28
169 blt 14f
170
171 CALGN( ands ip, r1, #31 )
172 CALGN( rsb ip, ip, #32 )
173 CALGN( sbcnes r4, ip, r2 ) @ C is always set here
174 CALGN( subcc r2, r2, ip )
175 CALGN( bcc 15f )
176
17711: stmfd sp!, {r5 - r9}
178
179 PLD( pld [r1, #0] )
180 PLD( subs r2, r2, #96 )
181 PLD( pld [r1, #28] )
182 PLD( blt 13f )
183 PLD( pld [r1, #60] )
184 PLD( pld [r1, #92] )
185
18612: PLD( pld [r1, #124] )
18713: ldr4w r1, r4, r5, r6, r7, abort=19f
188 mov r3, lr, pull #\pull
189 subs r2, r2, #32
190 ldr4w r1, r8, r9, ip, lr, abort=19f
191 orr r3, r3, r4, push #\push
192 mov r4, r4, pull #\pull
193 orr r4, r4, r5, push #\push
194 mov r5, r5, pull #\pull
195 orr r5, r5, r6, push #\push
196 mov r6, r6, pull #\pull
197 orr r6, r6, r7, push #\push
198 mov r7, r7, pull #\pull
199 orr r7, r7, r8, push #\push
200 mov r8, r8, pull #\pull
201 orr r8, r8, r9, push #\push
202 mov r9, r9, pull #\pull
203 orr r9, r9, ip, push #\push
204 mov ip, ip, pull #\pull
205 orr ip, ip, lr, push #\push
206 str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
207 bge 12b
208 PLD( cmn r2, #96 )
209 PLD( bge 13b )
210
211 ldmfd sp!, {r5 - r9}
212
21314: ands ip, r2, #28
214 beq 16f
215
21615: mov r3, lr, pull #\pull
217 ldr1w r1, lr, abort=21f
218 subs ip, ip, #4
219 orr r3, r3, lr, push #\push
220 str1w r0, r3, abort=21f
221 bgt 15b
222 CALGN( cmp r2, #0 )
223 CALGN( bge 11b )
224
22516: sub r1, r1, #(\push / 8)
226 b 8b
227
228 .endm
229
230
231 forward_copy_shift pull=8 push=24
232
23317: forward_copy_shift pull=16 push=16
234
23518: forward_copy_shift pull=24 push=8
236
237
238/*
239 * Abort preanble and completion macros.
240 * If a fixup handler is required then those macros must surround it.
241 * It is assumed that the fixup code will handle the private part of
242 * the exit macro.
243 */
244
245 .macro copy_abort_preamble
24619: ldmfd sp!, {r5 - r9}
247 b 21f
24820: ldmfd sp!, {r5 - r8}
24921:
250 .endm
251
252 .macro copy_abort_end
253 ldmfd sp!, {r4, pc}
254 .endm
255