diff options
Diffstat (limited to 'arch/microblaze/lib/fastcopy.S')
-rw-r--r-- | arch/microblaze/lib/fastcopy.S | 662 |
1 files changed, 662 insertions, 0 deletions
diff --git a/arch/microblaze/lib/fastcopy.S b/arch/microblaze/lib/fastcopy.S new file mode 100644 index 000000000000..02e3ab4eddf3 --- /dev/null +++ b/arch/microblaze/lib/fastcopy.S | |||
@@ -0,0 +1,662 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> | ||
3 | * Copyright (C) 2008-2009 PetaLogix | ||
4 | * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. | ||
5 | * | ||
6 | * This file is subject to the terms and conditions of the GNU General | ||
7 | * Public License. See the file COPYING in the main directory of this | ||
8 | * archive for more details. | ||
9 | * | ||
10 | * Written by Jim Law <jlaw@irispower.com> | ||
11 | * | ||
12 | * intended to replace: | ||
13 | * memcpy in memcpy.c and | ||
14 | * memmove in memmove.c | ||
15 | * ... in arch/microblaze/lib | ||
16 | * | ||
17 | * | ||
18 | * assly_fastcopy.S | ||
19 | * | ||
20 | * Attempt at quicker memcpy and memmove for MicroBlaze | ||
21 | * Input : Operand1 in Reg r5 - destination address | ||
22 | * Operand2 in Reg r6 - source address | ||
23 | * Operand3 in Reg r7 - number of bytes to transfer | ||
24 | * Output: Result in Reg r3 - starting destination address | ||
25 | * | ||
26 | * | ||
27 | * Explanation: | ||
28 | * Perform (possibly unaligned) copy of a block of memory | ||
29 | * between mem locations with size of xfer spec'd in bytes | ||
30 | */ | ||
31 | |||
32 | #include <linux/linkage.h> | ||
33 | |||
34 | .globl memcpy /* memcpy: d in r5, s in r6, byte count c in r7; original d is returned in r3 */ | ||
35 | .ent memcpy | ||
36 | |||
37 | memcpy: | ||
38 | fast_memcpy_ascending: /* memmove also branches here when it picks the ascending copy */ | ||
39 | /* move d to return register as value of function */ | ||
40 | addi r3, r5, 0 | ||
41 | |||
42 | addi r4, r0, 4 /* n = 4 */ | ||
43 | cmpu r4, r4, r7 /* n = c - n (unsigned compare of c against 4) */ | ||
44 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer: byte loop only */ | ||
45 | |||
46 | /* transfer first 0~3 bytes to get aligned dest address */ | ||
47 | andi r4, r5, 3 /* n = d & 3 */ | ||
48 | /* if zero, destination already aligned */ | ||
49 | beqi r4, a_dalign_done | ||
50 | /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */ | ||
51 | rsubi r4, r4, 4 | ||
52 | rsub r7, r4, r7 /* c = c - n: account up front for the n bytes copied below */ | ||
53 | |||
54 | a_xfer_first_loop: /* copy n single bytes so that d becomes word aligned */ | ||
55 | /* if no bytes left to transfer, transfer the bulk */ | ||
56 | beqi r4, a_dalign_done | ||
57 | lbui r11, r6, 0 /* h = *s */ | ||
58 | sbi r11, r5, 0 /* *d = h */ | ||
59 | addi r6, r6, 1 /* s++ */ | ||
60 | addi r5, r5, 1 /* d++ */ | ||
61 | brid a_xfer_first_loop /* loop */ | ||
62 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ | ||
63 | |||
64 | a_dalign_done: /* d is word aligned here; copy as many whole 32-byte blocks as c allows */ | ||
65 | addi r4, r0, 32 /* n = 32 */ | ||
66 | cmpu r4, r4, r7 /* n = c - n (unsigned compare of c against 32) */ | ||
67 | /* if n < 0, less than one block to transfer */ | ||
68 | blti r4, a_block_done | ||
69 | |||
70 | a_block_xfer: | ||
71 | andi r4, r7, 0xffffffe0 /* n = c & ~31 (bytes to move as whole blocks) */ | ||
72 | rsub r7, r4, r7 /* c = c - n (bytes left for the word/byte stages) */ | ||
73 | |||
74 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
75 | /* if temp != 0, unaligned transfers needed */ | ||
76 | bnei r9, a_block_unaligned | ||
77 | |||
78 | a_block_aligned: /* s and d both word aligned: 8 word loads/stores per pass */ | ||
79 | lwi r9, r6, 0 /* t1 = *(s + 0) */ | ||
80 | lwi r10, r6, 4 /* t2 = *(s + 4) */ | ||
81 | lwi r11, r6, 8 /* t3 = *(s + 8) */ | ||
82 | lwi r12, r6, 12 /* t4 = *(s + 12) */ | ||
83 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
84 | swi r10, r5, 4 /* *(d + 4) = t2 */ | ||
85 | swi r11, r5, 8 /* *(d + 8) = t3 */ | ||
86 | swi r12, r5, 12 /* *(d + 12) = t4 */ | ||
87 | lwi r9, r6, 16 /* t1 = *(s + 16) */ | ||
88 | lwi r10, r6, 20 /* t2 = *(s + 20) */ | ||
89 | lwi r11, r6, 24 /* t3 = *(s + 24) */ | ||
90 | lwi r12, r6, 28 /* t4 = *(s + 28) */ | ||
91 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
92 | swi r10, r5, 20 /* *(d + 20) = t2 */ | ||
93 | swi r11, r5, 24 /* *(d + 24) = t3 */ | ||
94 | swi r12, r5, 28 /* *(d + 28) = t4 */ | ||
95 | addi r6, r6, 32 /* s = s + 32 */ | ||
96 | addi r4, r4, -32 /* n = n - 32 */ | ||
97 | bneid r4, a_block_aligned /* while (n) loop */ | ||
98 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
99 | bri a_block_done | ||
100 | |||
101 | a_block_unaligned: /* s not word aligned: read aligned words via as and merge neighbours with shifts */ | ||
102 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
103 | add r6, r6, r4 /* s = s + n (advanced now; the loops below read through as, not s) */ | ||
104 | lwi r11, r8, 0 /* h = *(as + 0) */ | ||
105 | |||
106 | addi r9, r9, -1 | ||
107 | beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */ | ||
108 | addi r9, r9, -1 | ||
109 | beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */ | ||
110 | |||
111 | a_block_u3: /* fall through: t1 was 3 => 3 byte offset. NOTE(review): shift directions look big-endian specific - confirm */ | ||
112 | bslli r11, r11, 24 /* h = h << 24 */ | ||
113 | a_bu3_loop: /* each output word = h | (next aligned word >> 8); h carries over to the next pass */ | ||
114 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
115 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
116 | or r9, r11, r9 /* t1 = h | t1 */ | ||
117 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
118 | bslli r11, r12, 24 /* h = v << 24 */ | ||
119 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
120 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
121 | or r9, r11, r9 /* t1 = h | t1 */ | ||
122 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
123 | bslli r11, r12, 24 /* h = v << 24 */ | ||
124 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
125 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
126 | or r9, r11, r9 /* t1 = h | t1 */ | ||
127 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
128 | bslli r11, r12, 24 /* h = v << 24 */ | ||
129 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
130 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
131 | or r9, r11, r9 /* t1 = h | t1 */ | ||
132 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
133 | bslli r11, r12, 24 /* h = v << 24 */ | ||
134 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
135 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
136 | or r9, r11, r9 /* t1 = h | t1 */ | ||
137 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
138 | bslli r11, r12, 24 /* h = v << 24 */ | ||
139 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
140 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
141 | or r9, r11, r9 /* t1 = h | t1 */ | ||
142 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
143 | bslli r11, r12, 24 /* h = v << 24 */ | ||
144 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
145 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
146 | or r9, r11, r9 /* t1 = h | t1 */ | ||
147 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
148 | bslli r11, r12, 24 /* h = v << 24 */ | ||
149 | lwi r12, r8, 32 /* v = *(as + 32) */ | ||
150 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
151 | or r9, r11, r9 /* t1 = h | t1 */ | ||
152 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
153 | bslli r11, r12, 24 /* h = v << 24 */ | ||
154 | addi r8, r8, 32 /* as = as + 32 */ | ||
155 | addi r4, r4, -32 /* n = n - 32 */ | ||
156 | bneid r4, a_bu3_loop /* while (n) loop */ | ||
157 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
158 | bri a_block_done | ||
159 | |||
160 | a_block_u1: /* ascending 32-byte blocks, source 1 byte past word alignment */ | ||
161 | bslli r11, r11, 8 /* h = h << 8 */ | ||
162 | a_bu1_loop: /* each output word = h | (next aligned word >> 24); h carries over to the next pass */ | ||
163 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
164 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
165 | or r9, r11, r9 /* t1 = h | t1 */ | ||
166 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
167 | bslli r11, r12, 8 /* h = v << 8 */ | ||
168 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
169 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
170 | or r9, r11, r9 /* t1 = h | t1 */ | ||
171 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
172 | bslli r11, r12, 8 /* h = v << 8 */ | ||
173 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
174 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
175 | or r9, r11, r9 /* t1 = h | t1 */ | ||
176 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
177 | bslli r11, r12, 8 /* h = v << 8 */ | ||
178 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
179 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
180 | or r9, r11, r9 /* t1 = h | t1 */ | ||
181 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
182 | bslli r11, r12, 8 /* h = v << 8 */ | ||
183 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
184 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
185 | or r9, r11, r9 /* t1 = h | t1 */ | ||
186 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
187 | bslli r11, r12, 8 /* h = v << 8 */ | ||
188 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
189 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
190 | or r9, r11, r9 /* t1 = h | t1 */ | ||
191 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
192 | bslli r11, r12, 8 /* h = v << 8 */ | ||
193 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
194 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
195 | or r9, r11, r9 /* t1 = h | t1 */ | ||
196 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
197 | bslli r11, r12, 8 /* h = v << 8 */ | ||
198 | lwi r12, r8, 32 /* v = *(as + 32) */ | ||
199 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
200 | or r9, r11, r9 /* t1 = h | t1 */ | ||
201 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
202 | bslli r11, r12, 8 /* h = v << 8 */ | ||
203 | addi r8, r8, 32 /* as = as + 32 */ | ||
204 | addi r4, r4, -32 /* n = n - 32 */ | ||
205 | bneid r4, a_bu1_loop /* while (n) loop */ | ||
206 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
207 | bri a_block_done | ||
208 | |||
209 | a_block_u2: /* ascending 32-byte blocks, source 2 bytes past word alignment */ | ||
210 | bslli r11, r11, 16 /* h = h << 16 */ | ||
211 | a_bu2_loop: /* each output word = h | (next aligned word >> 16); h carries over to the next pass */ | ||
212 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
213 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
214 | or r9, r11, r9 /* t1 = h | t1 */ | ||
215 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
216 | bslli r11, r12, 16 /* h = v << 16 */ | ||
217 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
218 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
219 | or r9, r11, r9 /* t1 = h | t1 */ | ||
220 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
221 | bslli r11, r12, 16 /* h = v << 16 */ | ||
222 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
223 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
224 | or r9, r11, r9 /* t1 = h | t1 */ | ||
225 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
226 | bslli r11, r12, 16 /* h = v << 16 */ | ||
227 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
228 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
229 | or r9, r11, r9 /* t1 = h | t1 */ | ||
230 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
231 | bslli r11, r12, 16 /* h = v << 16 */ | ||
232 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
233 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
234 | or r9, r11, r9 /* t1 = h | t1 */ | ||
235 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
236 | bslli r11, r12, 16 /* h = v << 16 */ | ||
237 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
238 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
239 | or r9, r11, r9 /* t1 = h | t1 */ | ||
240 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
241 | bslli r11, r12, 16 /* h = v << 16 */ | ||
242 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
243 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
244 | or r9, r11, r9 /* t1 = h | t1 */ | ||
245 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
246 | bslli r11, r12, 16 /* h = v << 16 */ | ||
247 | lwi r12, r8, 32 /* v = *(as + 32) */ | ||
248 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
249 | or r9, r11, r9 /* t1 = h | t1 */ | ||
250 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
251 | bslli r11, r12, 16 /* h = v << 16 */ | ||
252 | addi r8, r8, 32 /* as = as + 32 */ | ||
253 | addi r4, r4, -32 /* n = n - 32 */ | ||
254 | bneid r4, a_bu2_loop /* while (n) loop; on exit falls through to a_block_done */ | ||
255 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
256 | |||
257 | a_block_done: /* fewer than 32 bytes remain in c: copy the remaining whole words */ | ||
258 | addi r4, r0, 4 /* n = 4 */ | ||
259 | cmpu r4, r4, r7 /* n = c - n (unsigned compare of c against 4) */ | ||
260 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ | ||
261 | |||
262 | a_word_xfer: | ||
263 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ | ||
264 | addi r10, r0, 0 /* offset = 0 */ | ||
265 | |||
266 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
267 | /* if temp != 0, unaligned transfers needed */ | ||
268 | bnei r9, a_word_unaligned | ||
269 | |||
270 | a_word_aligned: /* s and d stay fixed; offset in r10 walks upward one word at a time */ | ||
271 | lw r9, r6, r10 /* t1 = *(s+offset) */ | ||
272 | sw r9, r5, r10 /* *(d+offset) = t1 */ | ||
273 | addi r4, r4,-4 /* n = n - 4 */ | ||
274 | bneid r4, a_word_aligned /* loop */ | ||
275 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
276 | |||
277 | bri a_word_done | ||
278 | |||
279 | a_word_unaligned: /* word-at-a-time copy from an unaligned source, same shift-and-merge scheme as the block loops */ | ||
280 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
281 | lwi r11, r8, 0 /* h = *(as + 0) */ | ||
282 | addi r8, r8, 4 /* as = as + 4 */ | ||
283 | |||
284 | addi r9, r9, -1 | ||
285 | beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */ | ||
286 | addi r9, r9, -1 | ||
287 | beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */ | ||
288 | |||
289 | a_word_u3: /* fall through: t1 was 3 => 3 byte offset */ | ||
290 | bslli r11, r11, 24 /* h = h << 24 */ | ||
291 | a_wu3_loop: | ||
292 | lw r12, r8, r10 /* v = *(as + offset) */ | ||
293 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
294 | or r9, r11, r9 /* t1 = h | t1 */ | ||
295 | sw r9, r5, r10 /* *(d + offset) = t1 */ | ||
296 | bslli r11, r12, 24 /* h = v << 24 */ | ||
297 | addi r4, r4,-4 /* n = n - 4 */ | ||
298 | bneid r4, a_wu3_loop /* while (n) loop */ | ||
299 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
300 | |||
301 | bri a_word_done | ||
302 | |||
303 | a_word_u1: | ||
304 | bslli r11, r11, 8 /* h = h << 8 */ | ||
305 | a_wu1_loop: | ||
306 | lw r12, r8, r10 /* v = *(as + offset) */ | ||
307 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
308 | or r9, r11, r9 /* t1 = h | t1 */ | ||
309 | sw r9, r5, r10 /* *(d + offset) = t1 */ | ||
310 | bslli r11, r12, 8 /* h = v << 8 */ | ||
311 | addi r4, r4,-4 /* n = n - 4 */ | ||
312 | bneid r4, a_wu1_loop /* while (n) loop */ | ||
313 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
314 | |||
315 | bri a_word_done | ||
316 | |||
317 | a_word_u2: | ||
318 | bslli r11, r11, 16 /* h = h << 16 */ | ||
319 | a_wu2_loop: | ||
320 | lw r12, r8, r10 /* v = *(as + offset) */ | ||
321 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
322 | or r9, r11, r9 /* t1 = h | t1 */ | ||
323 | sw r9, r5, r10 /* *(d + offset) = t1 */ | ||
324 | bslli r11, r12, 16 /* h = v << 16 */ | ||
325 | addi r4, r4,-4 /* n = n - 4 */ | ||
326 | bneid r4, a_wu2_loop /* while (n) loop; on exit falls through to a_word_done */ | ||
327 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
328 | |||
329 | a_word_done: /* fold the word-stage offset back into d, s and c */ | ||
330 | add r5, r5, r10 /* d = d + offset */ | ||
331 | add r6, r6, r10 /* s = s + offset */ | ||
332 | rsub r7, r10, r7 /* c = c - offset */ | ||
333 | |||
334 | a_xfer_end: /* copy the remaining c bytes one at a time */ | ||
335 | a_xfer_end_loop: | ||
336 | beqi r7, a_done /* while (c) */ | ||
337 | lbui r9, r6, 0 /* t1 = *s */ | ||
338 | addi r6, r6, 1 /* s++ */ | ||
339 | sbi r9, r5, 0 /* *d = t1 */ | ||
340 | addi r7, r7, -1 /* c-- */ | ||
341 | brid a_xfer_end_loop /* loop */ | ||
342 | addi r5, r5, 1 /* d++ (IN DELAY SLOT) */ | ||
343 | |||
344 | a_done: /* common exit; memmove's descending byte loop also branches here */ | ||
345 | rtsd r15, 8 /* return; r3 still holds the original d */ | ||
346 | nop /* delay slot of rtsd */ | ||
347 | |||
348 | .end memcpy | ||
349 | /*----------------------------------------------------------------------------*/ | ||
350 | .globl memmove /* memmove: d in r5, s in r6, byte count c in r7; original d is returned in r3 */ | ||
351 | .ent memmove | ||
352 | |||
353 | memmove: | ||
354 | cmpu r4, r5, r6 /* n = s - d (unsigned compare of s against d) */ | ||
355 | bgei r4,fast_memcpy_ascending /* n >= 0: reuse memcpy's ascending copy; otherwise copy descending */ | ||
356 | |||
357 | fast_memcpy_descending: | ||
358 | /* move d to return register as value of function */ | ||
359 | addi r3, r5, 0 | ||
360 | |||
361 | add r5, r5, r7 /* d = d + c (one past the last dest byte) */ | ||
362 | add r6, r6, r7 /* s = s + c (one past the last source byte) */ | ||
363 | |||
364 | addi r4, r0, 4 /* n = 4 */ | ||
365 | cmpu r4, r4, r7 /* n = c - n (unsigned compare of c against 4) */ | ||
366 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ | ||
367 | |||
368 | /* transfer first 0~3 bytes to get aligned dest address */ | ||
369 | andi r4, r5, 3 /* n = d & 3 */ | ||
370 | /* if zero, destination already aligned */ | ||
371 | beqi r4,d_dalign_done | ||
372 | rsub r7, r4, r7 /* c = c - n adjust c */ | ||
373 | |||
374 | d_xfer_first_loop: /* copy n single bytes downward so that d becomes word aligned */ | ||
375 | /* if no bytes left to transfer, transfer the bulk */ | ||
376 | beqi r4,d_dalign_done | ||
377 | addi r6, r6, -1 /* s-- */ | ||
378 | addi r5, r5, -1 /* d-- */ | ||
379 | lbui r11, r6, 0 /* h = *s */ | ||
380 | sbi r11, r5, 0 /* *d = h */ | ||
381 | brid d_xfer_first_loop /* loop */ | ||
382 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ | ||
383 | |||
384 | d_dalign_done: /* d is word aligned here; copy as many whole 32-byte blocks as c allows, moving downward */ | ||
385 | addi r4, r0, 32 /* n = 32 */ | ||
386 | cmpu r4, r4, r7 /* n = c - n (unsigned compare of c against 32) */ | ||
387 | /* if n < 0, less than one block to transfer */ | ||
388 | blti r4, d_block_done | ||
389 | |||
390 | d_block_xfer: | ||
391 | andi r4, r7, 0xffffffe0 /* n = c & ~31 (bytes to move as whole blocks) */ | ||
392 | rsub r7, r4, r7 /* c = c - n (bytes left for the word/byte stages) */ | ||
393 | |||
394 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
395 | /* if temp != 0, unaligned transfers needed */ | ||
396 | bnei r9, d_block_unaligned | ||
397 | |||
398 | d_block_aligned: /* s and d both word aligned: step back 32 bytes, then copy words from highest offset to lowest */ | ||
399 | addi r6, r6, -32 /* s = s - 32 */ | ||
400 | addi r5, r5, -32 /* d = d - 32 */ | ||
401 | lwi r9, r6, 28 /* t1 = *(s + 28) */ | ||
402 | lwi r10, r6, 24 /* t2 = *(s + 24) */ | ||
403 | lwi r11, r6, 20 /* t3 = *(s + 20) */ | ||
404 | lwi r12, r6, 16 /* t4 = *(s + 16) */ | ||
405 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
406 | swi r10, r5, 24 /* *(d + 24) = t2 */ | ||
407 | swi r11, r5, 20 /* *(d + 20) = t3 */ | ||
408 | swi r12, r5, 16 /* *(d + 16) = t4 */ | ||
409 | lwi r9, r6, 12 /* t1 = *(s + 12) */ | ||
410 | lwi r10, r6, 8 /* t2 = *(s + 8) */ | ||
411 | lwi r11, r6, 4 /* t3 = *(s + 4) */ | ||
412 | lwi r12, r6, 0 /* t4 = *(s + 0) */ | ||
413 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
414 | swi r10, r5, 8 /* *(d + 8) = t2 */ | ||
415 | swi r11, r5, 4 /* *(d + 4) = t3 */ | ||
416 | addi r4, r4, -32 /* n = n - 32 */ | ||
417 | bneid r4, d_block_aligned /* while (n) loop */ | ||
418 | swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */ | ||
419 | bri d_block_done | ||
420 | |||
421 | d_block_unaligned: /* s not word aligned: read aligned words via as and merge neighbours with shifts, moving downward */ | ||
422 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
423 | rsub r6, r4, r6 /* s = s - n (moved back now; the loops below read through as, not s) */ | ||
424 | lwi r11, r8, 0 /* h = *(as + 0) */ | ||
425 | |||
426 | addi r9, r9, -1 | ||
427 | beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */ | ||
428 | addi r9, r9, -1 | ||
429 | beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */ | ||
430 | |||
431 | d_block_u3: /* fall through: t1 was 3 => 3 byte offset. NOTE(review): shift directions look big-endian specific - confirm */ | ||
432 | bsrli r11, r11, 8 /* h = h >> 8 */ | ||
433 | d_bu3_loop: /* each output word = h | (next lower aligned word << 24); h carries over to the next pass */ | ||
434 | addi r8, r8, -32 /* as = as - 32 */ | ||
435 | addi r5, r5, -32 /* d = d - 32 */ | ||
436 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
437 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
438 | or r9, r11, r9 /* t1 = h | t1 */ | ||
439 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
440 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
441 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
442 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
443 | or r9, r11, r9 /* t1 = h | t1 */ | ||
444 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
445 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
446 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
447 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
448 | or r9, r11, r9 /* t1 = h | t1 */ | ||
449 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
450 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
451 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
452 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
453 | or r9, r11, r9 /* t1 = h | t1 */ | ||
454 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
455 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
456 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
457 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
458 | or r9, r11, r9 /* t1 = h | t1 */ | ||
459 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
460 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
461 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
462 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
463 | or r9, r11, r9 /* t1 = h | t1 */ | ||
464 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
465 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
466 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
467 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
468 | or r9, r11, r9 /* t1 = h | t1 */ | ||
469 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
470 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
471 | lwi r12, r8, 0 /* v = *(as + 0) */ | ||
472 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
473 | or r9, r11, r9 /* t1 = h | t1 */ | ||
474 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
475 | addi r4, r4, -32 /* n = n - 32 */ | ||
476 | bneid r4, d_bu3_loop /* while (n) loop */ | ||
477 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ | ||
478 | bri d_block_done | ||
479 | |||
480 | d_block_u1: /* descending 32-byte blocks, source 1 byte past word alignment */ | ||
481 | bsrli r11, r11, 24 /* h = h >> 24 */ | ||
482 | d_bu1_loop: /* each output word = h | (next lower aligned word << 8); h carries over to the next pass */ | ||
483 | addi r8, r8, -32 /* as = as - 32 */ | ||
484 | addi r5, r5, -32 /* d = d - 32 */ | ||
485 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
486 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
487 | or r9, r11, r9 /* t1 = h | t1 */ | ||
488 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
489 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
490 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
491 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
492 | or r9, r11, r9 /* t1 = h | t1 */ | ||
493 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
494 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
495 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
496 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
497 | or r9, r11, r9 /* t1 = h | t1 */ | ||
498 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
499 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
500 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
501 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
502 | or r9, r11, r9 /* t1 = h | t1 */ | ||
503 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
504 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
505 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
506 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
507 | or r9, r11, r9 /* t1 = h | t1 */ | ||
508 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
509 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
510 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
511 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
512 | or r9, r11, r9 /* t1 = h | t1 */ | ||
513 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
514 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
515 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
516 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
517 | or r9, r11, r9 /* t1 = h | t1 */ | ||
518 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
519 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
520 | lwi r12, r8, 0 /* v = *(as + 0) */ | ||
521 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
522 | or r9, r11, r9 /* t1 = h | t1 */ | ||
523 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
524 | addi r4, r4, -32 /* n = n - 32 */ | ||
525 | bneid r4, d_bu1_loop /* while (n) loop */ | ||
526 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ | ||
527 | bri d_block_done | ||
528 | |||
529 | d_block_u2: /* descending 32-byte blocks, source 2 bytes past word alignment */ | ||
530 | bsrli r11, r11, 16 /* h = h >> 16 */ | ||
531 | d_bu2_loop: /* each output word = h | (next lower aligned word << 16); h carries over to the next pass */ | ||
532 | addi r8, r8, -32 /* as = as - 32 */ | ||
533 | addi r5, r5, -32 /* d = d - 32 */ | ||
534 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
535 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
536 | or r9, r11, r9 /* t1 = h | t1 */ | ||
537 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
538 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
539 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
540 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
541 | or r9, r11, r9 /* t1 = h | t1 */ | ||
542 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
543 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
544 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
545 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
546 | or r9, r11, r9 /* t1 = h | t1 */ | ||
547 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
548 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
549 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
550 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
551 | or r9, r11, r9 /* t1 = h | t1 */ | ||
552 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
553 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
554 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
555 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
556 | or r9, r11, r9 /* t1 = h | t1 */ | ||
557 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
558 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
559 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
560 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
561 | or r9, r11, r9 /* t1 = h | t1 */ | ||
562 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
563 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
564 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
565 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
566 | or r9, r11, r9 /* t1 = h | t1 */ | ||
567 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
568 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
569 | lwi r12, r8, 0 /* v = *(as + 0) */ | ||
570 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
571 | or r9, r11, r9 /* t1 = h | t1 */ | ||
572 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
573 | addi r4, r4, -32 /* n = n - 32 */ | ||
574 | bneid r4, d_bu2_loop /* while (n) loop; on exit falls through to d_block_done */ | ||
575 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ | ||
576 | |||
577 | d_block_done: /* fewer than 32 bytes remain in c: copy the remaining whole words, moving downward */ | ||
578 | addi r4, r0, 4 /* n = 4 */ | ||
579 | cmpu r4, r4, r7 /* n = c - n (unsigned compare of c against 4) */ | ||
580 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ | ||
581 | |||
582 | d_word_xfer: /* d and s are moved back by n up front; n then serves as the offset, counting down to 0 */ | ||
583 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ | ||
584 | rsub r5, r4, r5 /* d = d - n */ | ||
585 | rsub r6, r4, r6 /* s = s - n */ | ||
586 | rsub r7, r4, r7 /* c = c - n */ | ||
587 | |||
588 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
589 | /* if temp != 0, unaligned transfers needed */ | ||
590 | bnei r9, d_word_unaligned | ||
591 | |||
592 | d_word_aligned: | ||
593 | addi r4, r4,-4 /* n = n - 4 */ | ||
594 | lw r9, r6, r4 /* t1 = *(s+n) */ | ||
595 | bneid r4, d_word_aligned /* loop */ | ||
596 | sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */ | ||
597 | |||
598 | bri d_word_done | ||
599 | |||
600 | d_word_unaligned: /* descending word-at-a-time copy from an unaligned source, same shift-and-merge scheme as the block loops */ | ||
601 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
602 | lw r11, r8, r4 /* h = *(as + n) */ | ||
603 | |||
604 | addi r9, r9, -1 | ||
605 | beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */ | ||
606 | addi r9, r9, -1 | ||
607 | beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */ | ||
608 | |||
609 | d_word_u3: /* fall through: t1 was 3 => 3 byte offset */ | ||
610 | bsrli r11, r11, 8 /* h = h >> 8 */ | ||
611 | d_wu3_loop: | ||
612 | addi r4, r4,-4 /* n = n - 4 */ | ||
613 | lw r12, r8, r4 /* v = *(as + n) */ | ||
614 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
615 | or r9, r11, r9 /* t1 = h | t1 */ | ||
616 | sw r9, r5, r4 /* *(d + n) = t1 */ | ||
617 | bneid r4, d_wu3_loop /* while (n) loop */ | ||
618 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ | ||
619 | |||
620 | bri d_word_done | ||
621 | |||
622 | d_word_u1: | ||
623 | bsrli r11, r11, 24 /* h = h >> 24 */ | ||
624 | d_wu1_loop: | ||
625 | addi r4, r4,-4 /* n = n - 4 */ | ||
626 | lw r12, r8, r4 /* v = *(as + n) */ | ||
627 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
628 | or r9, r11, r9 /* t1 = h | t1 */ | ||
629 | sw r9, r5, r4 /* *(d + n) = t1 */ | ||
630 | bneid r4, d_wu1_loop /* while (n) loop */ | ||
631 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ | ||
632 | |||
633 | bri d_word_done | ||
634 | |||
635 | d_word_u2: | ||
636 | bsrli r11, r11, 16 /* h = h >> 16 */ | ||
637 | d_wu2_loop: | ||
638 | addi r4, r4,-4 /* n = n - 4 */ | ||
639 | lw r12, r8, r4 /* v = *(as + n) */ | ||
640 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
641 | or r9, r11, r9 /* t1 = h | t1 */ | ||
642 | sw r9, r5, r4 /* *(d + n) = t1 */ | ||
643 | bneid r4, d_wu2_loop /* while (n) loop; on exit falls through to d_word_done */ | ||
644 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ | ||
645 | |||
646 | d_word_done: /* word stage finished; d and s were already moved back by d_word_xfer */ | ||
647 | |||
648 | d_xfer_end: /* copy the remaining c bytes one at a time, moving downward */ | ||
649 | d_xfer_end_loop: | ||
650 | beqi r7, d_done /* while (c); exit through memmove's own epilogue below */ | ||
651 | addi r6, r6, -1 /* s-- */ | ||
652 | lbui r9, r6, 0 /* t1 = *s */ | ||
653 | addi r5, r5, -1 /* d-- */ | ||
654 | sbi r9, r5, 0 /* *d = t1 */ | ||
655 | brid d_xfer_end_loop /* loop */ | ||
656 | addi r7, r7, -1 /* c-- (IN DELAY SLOT) */ | ||
657 | |||
658 | d_done: /* exit of the descending copy */ | ||
659 | rtsd r15, 8 /* return; r3 still holds the original d */ | ||
660 | nop /* delay slot of rtsd */ | ||
661 | |||
662 | .end memmove | ||