author		Nicolas Pitre <nico@cam.org>			2005-11-01 14:52:23 -0500
committer	Russell King <rmk+kernel@arm.linux.org.uk>	2005-11-01 14:52:23 -0500
commit		7549423000fc38d39a8b81c601dea0332c113a42 (patch)
tree		6b76fe2867b9634a1d1dbaf682c69ccad4e9f71b /arch/arm/lib/memmove.S
parent		a0c6fdb987860e6c7f9b8e57439ca2703f462578 (diff)
[ARM] 2947/1: copy template with new memcpy/memmove
Patch from Nicolas Pitre

This patch provides a new implementation for optimized memory copy functions on ARM. It is made of two levels: a template that consists of the core copy code, and separate files that define macros to be used with the core code depending on the type of copy needed. This allows for best performance while sharing the same core for implementing memcpy(), copy_from_user() and copy_to_user() for instance.

Two reasons for this work:

1) the current copy_to_user/copy_from_user implementation assumes no task switch will ever occur in the middle of each copied page, making it completely unsafe with CONFIG_PREEMPT=y.

2) current copy implementations are measurably suboptimal, and optimizing different implementations separately is a pain and creates more opportunities for bugs.

The reason for (1) is the fact that copies inside user pages are performed with the ldm instruction, which has no means of testing user protections and could possibly race with process preemption, bypassing the COW mechanism for example. This is a longstanding issue that we said ought to be fixed for about two years now. The solution is to substitute those ldm insns with a series of ldrt or strt insns to enforce user memory protection. At least on StrongARM and XScale cores the ldm is not faster than the equivalent ldr/str insns with a warm i-cache, so there is no measurable performance degradation with that change. The fact that the copy code is a template makes it pretty easy to reuse the same core code for memcpy and benefit from the same performance optimizations.

Now (2) is best demonstrated with actual throughput measurements. First, here is a summary of memcopy tests performed on a StrongARM core:

	PTR alignment	buffer size	kernel version	this version
	------------------------------------------------------------
	  aligned	      32	     59.73	    107.43
	unaligned	      32	     61.31	     74.72
	  aligned	     100	    132.47	    136.15
	unaligned	     100	    103.84	    123.76
	  aligned	    4096	    130.67	    130.80
	unaligned	    4096	    130.68	    130.64
	  aligned	 1048576	     68.03	     68.18
	unaligned	 1048576	     68.03	     68.18

The buffer size is in bytes and the measured speed in MB/s. The copy was performed repeatedly with the given buffer and throughput averaged over 3 seconds.

Here we can see that the current kernel version has a higher entry cost that shows up with small buffers. As the buffer size grows, both implementations converge to the same throughput.

Now here's the exact same test performed on an XScale core (PXA255):

	PTR alignment	buffer size	kernel version	this version
	------------------------------------------------------------
	  aligned	      32	     46.99	     77.58
	unaligned	      32	     53.61	     59.59
	  aligned	     100	    107.19	    136.59
	unaligned	     100	     83.61	     97.58
	  aligned	    4096	    129.13	    129.98
	unaligned	    4096	    128.36	    128.53
	  aligned	 1048576	     53.76	     59.41
	unaligned	 1048576	     33.67	     56.96

Again we can see the entry setup cost being higher for the current kernel before getting to the main copy loop. Then throughput results converge as long as the buffer remains in the cache. The 1MB case shows more differences, probably due to better pld placement and/or fewer instruction interlocks in this proposed implementation.

Disclaimer: The PXA system was running with slower clocks than the StrongARM system, so trying to infer any conclusion by comparing those separate sets of results side by side would be completely inappropriate.

So... What this patch does is to replace both memcpy and memmove with an implementation based on the provided copy code template.

The memmove code is kept separate since it is used only if the memory areas involved do overlap, in which case the code is a transposition of the template but with the copy occurring in the opposite direction (trying to fit that mode into the template turned it into a mess not worth it for memmove alone). And obviously both memcpy and memmove were tested with all kinds of pointer alignments and buffer sizes to exercise all code paths for correctness.

The next patch will provide the now trivial replacement implementations of copy_to_user and copy_from_user.

Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
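As a rough sketch of point (1) above (illustration only, not code from this patch): an ldm block load accesses memory with the current, privileged permissions and offers no way to apply user-space permission checks, whereas the same transfer written as individual ldrt loads is checked on every access, which is what the template-based copy_from_user/copy_to_user in the follow-up patch rely on:

	ldmia	r1!, {r3 - r6}		@ unchecked 16-byte block load

	ldrt	r3, [r1], #4		@ user-checked equivalent,
	ldrt	r4, [r1], #4		@ one word at a time
	ldrt	r5, [r1], #4
	ldrt	r6, [r1], #4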
Diffstat (limited to 'arch/arm/lib/memmove.S')
-rw-r--r--	arch/arm/lib/memmove.S	206
1 file changed, 206 insertions, 0 deletions
diff --git a/arch/arm/lib/memmove.S b/arch/arm/lib/memmove.S
new file mode 100644
index 000000000000..ef7fddc14ac9
--- /dev/null
+++ b/arch/arm/lib/memmove.S
@@ -0,0 +1,206 @@
/*
 *  linux/arch/arm/lib/memmove.S
 *
 *  Author:	Nicolas Pitre
 *  Created:	Sep 28, 2005
 *  Copyright:	(C) MontaVista Software Inc.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
 * This can be used to enable code to cacheline align the source pointer.
 * Experiments on tested architectures (StrongARM and XScale) didn't show
 * this a worthwhile thing to do.  That might be different in the future.
 */
//#define CALGN(code...)	code
#define CALGN(code...)

		.text

/*
 * Prototype: void *memmove(void *dest, const void *src, size_t n);
 *
 * Note:
 *
 * If the memory regions don't overlap, we simply branch to memcpy which is
 * normally a bit faster. Otherwise the copy is done going downwards.  This
 * is a transposition of the code from copy_template.S but with the copy
 * occurring in the opposite direction.
 */

ENTRY(memmove)

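		@ ip = dest - src.  Branch to memcpy unless the destination
		@ starts inside the source area (src < dest < src + n), i.e.
		@ unless a forward copy would overwrite not-yet-read source.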
		subs	ip, r0, r1
		cmphi	r2, ip
		bls	memcpy

		stmfd	sp!, {r0, r4, lr}
		add	r1, r1, r2
		add	r0, r0, r2
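		@ r0 and r1 now point one past the end of their buffers;
		@ everything below copies downwards, highest address first.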
		subs	r2, r2, #4
		blt	8f
		ands	ip, r0, #3
	PLD(	pld	[r1, #-4]		)
		bne	9f
		ands	ip, r1, #3
		bne	10f

1:		subs	r2, r2, #(28)
		stmfd	sp!, {r5 - r8}
		blt	5f

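	@ CALGN block (compiled out by default, see the CALGN definition at
	@ the top of the file): when enabled it enters the ldr/str ladder at
	@ 6: for a partial first round so that the main loop below operates
	@ on a cache-line aligned source.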
	CALGN(	ands	ip, r1, #31		)
	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
	CALGN(	bcs	2f			)
	CALGN(	adr	r4, 6f			)
	CALGN(	subs	r2, r2, ip		)  @ C is set here
	CALGN(	add	pc, r4, ip		)

	PLD(	pld	[r1, #-4]		)
2:	PLD(	subs	r2, r2, #96		)
	PLD(	pld	[r1, #-32]		)
	PLD(	blt	4f			)
	PLD(	pld	[r1, #-64]		)
	PLD(	pld	[r1, #-96]		)

3:	PLD(	pld	[r1, #-128]		)
4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
		subs	r2, r2, #32
		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
		bge	3b
	PLD(	cmn	r2, #96			)
	PLD(	bge	4b			)

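		@ Up to 28 bytes worth of whole words may remain (r2 & 28).
		@ The computed jumps below enter the ldr and str ladders
		@ part-way so only that many words are loaded and stored.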
5:		ands	ip, r2, #28
		rsb	ip, ip, #32
		addne	pc, pc, ip		@ C is always clear here
		b	7f
6:		nop
		ldr	r3, [r1, #-4]!
		ldr	r4, [r1, #-4]!
		ldr	r5, [r1, #-4]!
		ldr	r6, [r1, #-4]!
		ldr	r7, [r1, #-4]!
		ldr	r8, [r1, #-4]!
		ldr	lr, [r1, #-4]!

		add	pc, pc, ip
		nop
		nop
		str	r3, [r0, #-4]!
		str	r4, [r0, #-4]!
		str	r5, [r0, #-4]!
		str	r6, [r0, #-4]!
		str	r7, [r0, #-4]!
		str	r8, [r0, #-4]!
		str	lr, [r0, #-4]!

	CALGN(	bcs	2b			)

7:		ldmfd	sp!, {r5 - r8}

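		@ 0 to 3 bytes remain.  The lsl #31 moves bit 1 of r2 into
		@ the carry flag and bit 0 into N/Z, so the conditional byte
		@ loads/stores below transfer exactly that many bytes.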
8:		movs	r2, r2, lsl #31
		ldrneb	r3, [r1, #-1]!
		ldrcsb	r4, [r1, #-1]!
		ldrcsb	ip, [r1, #-1]
		strneb	r3, [r0, #-1]!
		strcsb	r4, [r0, #-1]!
		strcsb	ip, [r0, #-1]
		ldmfd	sp!, {r0, r4, pc}

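		@ Destination pointer is not word aligned: copy ip = r0 & 3
		@ bytes one at a time so that r0 becomes word aligned, then
		@ redo the alignment checks.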
9:		cmp	ip, #2
		ldrgtb	r3, [r1, #-1]!
		ldrgeb	r4, [r1, #-1]!
		ldrb	lr, [r1, #-1]!
		strgtb	r3, [r0, #-1]!
		strgeb	r4, [r0, #-1]!
		subs	r2, r2, ip
		strb	lr, [r0, #-1]!
		blt	8b
		ands	ip, r1, #3
		beq	1b

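		@ Source and destination are mutually misaligned: round r1
		@ down to a word boundary, preload the straddling word into
		@ r3 and dispatch on ip = r1 & 3 to the matching shift/merge
		@ variant below (17:, 18:, or fall-through).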
10:		bic	r1, r1, #3
		cmp	ip, #2
		ldr	r3, [r1, #0]
		beq	17f
		blt	18f


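	@ backward_copy_shift: backwards word copy for a source that is
	@ offset from the destination by 1, 2 or 3 bytes.  Words are loaded
	@ from the word-aligned source and each destination word is
	@ assembled from two adjacent source words, shifted by \push and
	@ \pull bits respectively (push + pull = 32).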
	.macro	backward_copy_shift push pull

		subs	r2, r2, #28
		blt	14f

	CALGN(	ands	ip, r1, #31		)
	CALGN(	rsb	ip, ip, #32		)
	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
	CALGN(	subcc	r2, r2, ip		)
	CALGN(	bcc	15f			)

11:		stmfd	sp!, {r5 - r9}

	PLD(	pld	[r1, #-4]		)
	PLD(	subs	r2, r2, #96		)
	PLD(	pld	[r1, #-32]		)
	PLD(	blt	13f			)
	PLD(	pld	[r1, #-64]		)
	PLD(	pld	[r1, #-96]		)

12:	PLD(	pld	[r1, #-128]		)
13:		ldmdb	r1!, {r7, r8, r9, ip}
		mov	lr, r3, push #\push
		subs	r2, r2, #32
		ldmdb	r1!, {r3, r4, r5, r6}
		orr	lr, lr, ip, pull #\pull
		mov	ip, ip, push #\push
		orr	ip, ip, r9, pull #\pull
		mov	r9, r9, push #\push
		orr	r9, r9, r8, pull #\pull
		mov	r8, r8, push #\push
		orr	r8, r8, r7, pull #\pull
		mov	r7, r7, push #\push
		orr	r7, r7, r6, pull #\pull
		mov	r6, r6, push #\push
		orr	r6, r6, r5, pull #\pull
		mov	r5, r5, push #\push
		orr	r5, r5, r4, pull #\pull
		mov	r4, r4, push #\push
		orr	r4, r4, r3, pull #\pull
		stmdb	r0!, {r4 - r9, ip, lr}
		bge	12b
	PLD(	cmn	r2, #96			)
	PLD(	bge	13b			)

		ldmfd	sp!, {r5 - r9}

14:		ands	ip, r2, #28
		beq	16f

15:		mov	lr, r3, push #\push
		ldr	r3, [r1, #-4]!
		subs	ip, ip, #4
		orr	lr, lr, r3, pull #\pull
		str	lr, [r0, #-4]!
		bgt	15b
	CALGN(	cmp	r2, #0			)
	CALGN(	bge	11b			)

16:		add	r1, r1, #(\pull / 8)
		b	8b

	.endm


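	@ One instantiation per residual source misalignment, selected by
	@ the dispatch at 10: above (ip = 3 falls through, ip = 2 enters at
	@ 17:, ip = 1 enters at 18:).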
		backward_copy_shift	push=8	pull=24

17:		backward_copy_shift	push=16	pull=16

18:		backward_copy_shift	push=24	pull=8
