author     Chris Zankel <czankel@tensilica.com>        2005-06-24 01:01:20 -0400
committer  Linus Torvalds <torvalds@ppc970.osdl.org>   2005-06-24 03:05:21 -0400
commit     249ac17e96811acc3c6402317dd5d5c89d2cbf68 (patch)
tree       0a174065460de196861b85f1d9a48c88b2a2675a /arch/xtensa/lib/memcopy.S
parent     5a0015d62668e64c8b6e02e360fbbea121bfd5e6 (diff)
[PATCH] xtensa: Architecture support for Tensilica Xtensa Part 4
The attached patch provides part 4 of an architecture implementation for the
Tensilica Xtensa CPU series.
Signed-off-by: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/xtensa/lib/memcopy.S')
-rw-r--r--   arch/xtensa/lib/memcopy.S   315
1 file changed, 315 insertions, 0 deletions
diff --git a/arch/xtensa/lib/memcopy.S b/arch/xtensa/lib/memcopy.S
new file mode 100644
index 000000000000..e8f6d7eb7222
--- /dev/null
+++ b/arch/xtensa/lib/memcopy.S
@@ -0,0 +1,315 @@
/*
 * arch/xtensa/lib/hal/memcopy.S -- Core HAL library functions
 * xthal_memcpy and xthal_bcopy
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Copyright (C) 2002 - 2005 Tensilica Inc.
 */

#include <xtensa/coreasm.h>

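/*
 * The src_b and ssa8 macros hide the endianness of the core: on
 * big-endian configurations (__XTENSA_EB__) the SRC funnel shift
 * takes its operand pair in the opposite order and the shift
 * amount register is set with ssa8b instead of ssa8l.
 */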
        .macro  src_b   r, w0, w1
#ifdef __XTENSA_EB__
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

        .macro  ssa8    r
#ifdef __XTENSA_EB__
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm


/*
 * void *memcpy(void *dst, const void *src, size_t len);
 * void *memmove(void *dst, const void *src, size_t len);
 * void *bcopy(const void *src, void *dst, size_t len);
 *
 * This function is intended to do the same thing as the standard
 * library function memcpy() (or bcopy()) for most cases.
 * However, where the source and/or destination references
 * an instruction RAM or ROM or a data RAM or ROM, that
 * source and/or destination will always be accessed with
 * 32-bit load and store instructions (as required for these
 * types of devices).
 *
 * !!!!!!! XTFIXME:
 * !!!!!!! Handling of IRAM/IROM has not yet
 * !!!!!!! been implemented.
 *
 * The bcopy version is provided here to avoid the overhead
 * of an extra call, for callers that require this convention.
 *
 * The (general case) algorithm is as follows:
 *   If destination is unaligned, align it by conditionally
 *     copying 1 and 2 bytes.
 *   If source is aligned,
 *     do 16 bytes with a loop, and then finish up with
 *     8, 4, 2, and 1 byte copies conditional on the length;
 *   else (if source is unaligned),
 *     do the same, but use SRC to align the source data.
 *   This code tries to use fall-through branches for the common
 *     case of aligned source and destination and multiple
 *     of 4 (or 8) length.
 *
 * Register use:
 *      a0/ return address
 *      a1/ stack pointer
 *      a2/ return value
 *      a3/ src
 *      a4/ length
 *      a5/ dst
 *      a6/ tmp
 *      a7/ tmp
 *      a8/ tmp
 *      a9/ tmp
 *      a10/ tmp
 *      a11/ tmp
 */

        .text
        .align  4
        .global bcopy
        .type   bcopy,@function
bcopy:
        entry   sp, 16          # minimal stack frame
        # a2=src, a3=dst, a4=len
        mov     a5, a3          # copy dst so that a2 is return value
        mov     a3, a2
        mov     a2, a5
        j       .Lcommon        # go to common code for memcpy+bcopy


/*
 * Byte by byte copy
 */
        .align  4
        .byte   0               # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
        loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a4, .Lbytecopydone
        add     a7, a3, a4      # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        s8i     a6, a5, 0
        addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
        blt     a3, a7, .Lnextbyte
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
        retw

/*
 * Destination is unaligned
 */

        .align  4
.Ldst1mod2:     # dst is only byte aligned
        _bltui  a4, 7, .Lbytecopy       # do short copies byte by byte

        # copy 1 byte
        l8ui    a6, a3, 0
        addi    a3, a3, 1
        addi    a4, a4, -1
        s8i     a6, a5, 0
        addi    a5, a5, 1
        _bbci.l a5, 1, .Ldstaligned     # if dst is now aligned, then
                                        # return to main algorithm
.Ldst2mod4:     # dst 16-bit aligned
        # copy 2 bytes
        _bltui  a4, 6, .Lbytecopy       # do short copies byte by byte
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        addi    a4, a4, -2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        j       .Ldstaligned    # dst is now aligned, return to main algorithm

        .align  4
        .global memcpy
        .type   memcpy,@function
memcpy:
        .global memmove
        .type   memmove,@function
memmove:

        entry   sp, 16          # minimal stack frame
        # a2/ dst, a3/ src, a4/ len
        mov     a5, a2          # copy dst so that a2 is return value
.Lcommon:
        _bbsi.l a2, 0, .Ldst1mod2       # if dst is 1 mod 2
        _bbsi.l a2, 1, .Ldst2mod4       # if dst is 2 mod 4
.Ldstaligned:   # return here from .Ldst?mod? once dst is aligned
        srli    a7, a4, 4       # number of loop iterations with 16B
                                # per iteration
        movi    a8, 3           # if source is not aligned,
        _bany   a3, a8, .Lsrcunaligned  # then use shifting copy
        /*
         * Destination and source are word-aligned, use word copy.
         */
        # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop1done
        slli    a8, a7, 4
        add     a8, a8, a3      # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        s32i    a6, a5, 0
        l32i    a6, a3, 8
        s32i    a7, a5, 4
        l32i    a7, a3, 12
        s32i    a6, a5, 8
        addi    a3, a3, 16
        s32i    a7, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a8, .Loop1
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
        bbci.l  a4, 3, .L2
        # copy 8 bytes
        l32i    a6, a3, 0
        l32i    a7, a3, 4
        addi    a3, a3, 8
        s32i    a6, a5, 0
        s32i    a7, a5, 4
        addi    a5, a5, 8
.L2:
        bbsi.l  a4, 2, .L3
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L3:
        # copy 4 bytes
        l32i    a6, a3, 0
        addi    a3, a3, 4
        s32i    a6, a5, 0
        addi    a5, a5, 4
        bbsi.l  a4, 1, .L4
        bbsi.l  a4, 0, .L5
        retw
.L4:
        # copy 2 bytes
        l16ui   a6, a3, 0
        addi    a3, a3, 2
        s16i    a6, a5, 0
        addi    a5, a5, 2
        bbsi.l  a4, 0, .L5
        retw
.L5:
        # copy 1 byte
        l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw

/*
 * Destination is aligned, Source is unaligned
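 *
 * The source pointer is aligned down to a word boundary and read
 * with word loads; each destination word is then extracted from a
 * pair of consecutive source words with a funnel shift (src_b),
 * using the shift amount that ssa8 computed from the original
 * source misalignment.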
 */

        .align  4
.Lsrcunaligned:
        _beqz   a4, .Ldone      # avoid loading anything for zero-length copies
        # copy 16 bytes per iteration for word-aligned dst and unaligned src
        ssa8    a3              # set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT    1       /* set to 1 when running on ISS (simulator)
                                           with the lint or ferret client, or 0
                                           to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        and     a11, a3, a8     # save unalignment offset for below
        sub     a3, a3, a11     # align a3
#endif
        l32i    a6, a3, 0       # load first word
#if XCHAL_HAVE_LOOPS
        loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
        beqz    a7, .Loop2done
        slli    a10, a7, 4
        add     a10, a10, a3    # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        l32i    a9, a3, 12
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        l32i    a6, a3, 16
        src_b   a8, a8, a9
        s32i    a8, a5, 8
        addi    a3, a3, 16
        src_b   a9, a9, a6
        s32i    a9, a5, 12
        addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
        blt     a3, a10, .Loop2
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
        bbci.l  a4, 3, .L12
        # copy 8 bytes
        l32i    a7, a3, 4
        l32i    a8, a3, 8
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a3, a3, 8
        src_b   a7, a7, a8
        s32i    a7, a5, 4
        addi    a5, a5, 8
        mov     a6, a8
.L12:
        bbci.l  a4, 2, .L13
        # copy 4 bytes
        l32i    a7, a3, 4
        addi    a3, a3, 4
        src_b   a6, a6, a7
        s32i    a6, a5, 0
        addi    a5, a5, 4
        mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
        add     a3, a3, a11     # readjust a3 with correct misalignment
#endif
        bbsi.l  a4, 1, .L14
        bbsi.l  a4, 0, .L15
.Ldone: retw
.L14:
        # copy 2 bytes
        l8ui    a6, a3, 0
        l8ui    a7, a3, 1
        addi    a3, a3, 2
        s8i     a6, a5, 0
        s8i     a7, a5, 1
        addi    a5, a5, 2
        bbsi.l  a4, 0, .L15
        retw
.L15:
        # copy 1 byte
        l8ui    a6, a3, 0
        s8i     a6, a5, 0
        retw

/*
 * Local Variables:
 * mode:fundamental
 * comment-start: "# "
 * comment-start-skip: "# *"
 * End:
 */
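
For readers who want the control flow without the Xtensa mnemonics, here is a
C-level restatement of the strategy described in the header comment: align the
destination with 1- and 2-byte copies, move 16 bytes per iteration while both
pointers are word-aligned, and finish with 8/4/2/1-byte tails selected from
the low bits of the length. The name memcpy_sketch is made up for
illustration; this is a sketch of the algorithm, not the kernel code, and its
misaligned-source branch falls back to byte copies instead of the assembly's
shifting copy.

#include <stddef.h>
#include <stdint.h>

void *memcpy_sketch(void *dst, const void *src, size_t len)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        /* Short copies go byte by byte (.Lbytecopy). */
        if (len < 7) {
                while (len--)
                        *d++ = *s++;
                return dst;
        }

        /* Align the destination with 1- and 2-byte copies
         * (.Ldst1mod2 / .Ldst2mod4). */
        if ((uintptr_t)d & 1) {
                *d++ = *s++;
                len--;
        }
        if ((uintptr_t)d & 2) {
                *d++ = *s++;
                *d++ = *s++;
                len -= 2;
        }

        if (((uintptr_t)s & 3) == 0) {
                /* Word-aligned source: 16 bytes per iteration (.Loop1),
                 * then the 8- and 4-byte tails (.Loop1done, .L3).  The
                 * word assignments stand in for the l32i/s32i pairs;
                 * both pointers are 4-byte aligned here. */
                uint32_t *dw = (uint32_t *)d;
                const uint32_t *sw = (const uint32_t *)s;
                size_t i;

                for (i = len >> 4; i; i--) {
                        dw[0] = sw[0]; dw[1] = sw[1];
                        dw[2] = sw[2]; dw[3] = sw[3];
                        dw += 4; sw += 4;
                }
                if (len & 8) {
                        dw[0] = sw[0]; dw[1] = sw[1];
                        dw += 2; sw += 2;
                }
                if (len & 4)
                        *dw++ = *sw++;
                d = (unsigned char *)dw;
                s = (const unsigned char *)sw;
        } else {
                /* Misaligned source: the assembly still issues word
                 * stores, funnel-shifting each output word out of two
                 * source words (.Lsrcunaligned, modeled in the next
                 * sketch); plain bytes keep this illustration simple. */
                size_t i;

                for (i = len >> 2; i; i--) {
                        *d++ = *s++; *d++ = *s++;
                        *d++ = *s++; *d++ = *s++;
                }
        }

        /* 2- and 1-byte tails (.L4, .L5). */
        if (len & 2) {
                *d++ = *s++;
                *d++ = *s++;
        }
        if (len & 1)
                *d = *s;
        return dst;
}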
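
The shifting copy itself rests on ssa8 and src_b. Assuming a little-endian
core (the ssa8l/src operand order above), the pair can be modeled in C roughly
as follows; src_le and copy_unaligned_src are hypothetical names used only for
this sketch.

#include <stddef.h>
#include <stdint.h>

/* SRC extracts the low 32 bits of (w1:w0) >> SAR; ssa8l sets
 * SAR = 8 * (addr & 3).  The sar == 0 guard exists only because C
 * forbids shifting a 32-bit value by 32; the assembly never reaches
 * this path with an aligned source. */
static uint32_t src_le(uint32_t w1, uint32_t w0, unsigned int sar)
{
        return sar ? (w0 >> sar) | (w1 << (32 - sar)) : w0;
}

/* Copy nwords 32-bit words to a word-aligned dst from a misaligned
 * src, mirroring .Lsrcunaligned/.Loop2. */
static void copy_unaligned_src(uint32_t *d, const unsigned char *s,
                               size_t nwords)
{
        unsigned int sar = 8 * ((uintptr_t)s & 3);              /* ssa8 */
        const uint32_t *sw =
                (const uint32_t *)((uintptr_t)s & ~(uintptr_t)3); /* align down */
        uint32_t w0 = sw[0];                            /* load first word */
        size_t i;

        for (i = 0; i < nwords; i++) {
                /* In range: with a misaligned src the last byte needed
                 * lives in sw[nwords], just as .Loop2 loads one word
                 * ahead of the bytes it has stored. */
                uint32_t w1 = sw[i + 1];
                d[i] = src_le(w1, w0, sar);             /* src_b */
                w0 = w1;        /* becomes the low word next time */
        }
}

In the real .Loop2 the four loads run ahead of the four src_b/s32i pairs,
rotating through a6-a9, so each source word is already in a register by the
time the funnel shift needs it.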