aboutsummaryrefslogtreecommitdiffstats
path: root/arch/microblaze/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/microblaze/lib')
-rw-r--r--arch/microblaze/lib/fastcopy.S662
-rw-r--r--arch/microblaze/lib/memcpy.c161
-rw-r--r--arch/microblaze/lib/memmove.c175
-rw-r--r--arch/microblaze/lib/memset.c82
4 files changed, 1080 insertions, 0 deletions
diff --git a/arch/microblaze/lib/fastcopy.S b/arch/microblaze/lib/fastcopy.S
new file mode 100644
index 000000000000..02e3ab4eddf3
--- /dev/null
+++ b/arch/microblaze/lib/fastcopy.S
@@ -0,0 +1,662 @@
1/*
2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
5 *
6 * This file is subject to the terms and conditions of the GNU General
7 * Public License. See the file COPYING in the main directory of this
8 * archive for more details.
9 *
10 * Written by Jim Law <jlaw@irispower.com>
11 *
12 * intended to replace:
13 * memcpy in memcpy.c and
14 * memmove in memmove.c
15 * ... in arch/microblaze/lib
16 *
17 *
18 * assly_fastcopy.S
19 *
20 * Attempt at quicker memcpy and memmove for MicroBlaze
21 * Input : Operand1 in Reg r5 - destination address
22 * Operand2 in Reg r6 - source address
23 * Operand3 in Reg r7 - number of bytes to transfer
24 * Output: Result in Reg r3 - starting destination address
25 *
26 *
27 * Explanation:
28 * Perform (possibly unaligned) copy of a block of memory
29 * between mem locations with size of xfer spec'd in bytes
30 */
31
32#include <linux/linkage.h>
33
/*
 * memcpy - copy r7 bytes from r6 to r5, walking addresses upward.
 *
 * Input:   r5 = destination, r6 = source, r7 = byte count
 * Output:  r3 = original destination
 * Scratch: r4, r8, r9, r10, r11, r12
 *
 * Phases: byte copies until the destination is word aligned, then 32-byte
 * blocks, then single words, then the trailing bytes.  Once the destination
 * is aligned, a source that is still not word aligned is handled by loading
 * aligned source words and assembling every output word from two
 * neighbouring ones with a shift/or pair (8, 16 or 24 bits depending on the
 * source offset).
 *
 * fast_memcpy_ascending is a second entry label; memmove branches to it.
 */

/*
 * The shift/or merge puts the lower-addressed source bytes into the most
 * significant bits of the stored word.  That only reproduces the source
 * byte order in memory on a big-endian core, so refuse to build otherwise.
 */
#ifdef __MICROBLAZEEL__
#error fastcopy.S word merging is big-endian only - use the C memcpy/memmove
#endif

	.globl	memcpy
	.type	memcpy, @function
	.ent	memcpy

memcpy:
fast_memcpy_ascending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	addi	r4, r0, 4		/* n = 4 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end		/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4, a_dalign_done
	/* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */
	rsubi	r4, r4, 4
	rsub	r7, r4, r7		/* c = c - n adjust c */

a_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4, a_dalign_done
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	addi	r6, r6, 1		/* s++ */
	addi	r5, r5, 1		/* d++ */
	brid	a_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

	/* ---- destination is word aligned from here on ---- */
a_dalign_done:
	addi	r4, r0, 32		/* n = 32 */
	cmpu	r4, r4, r7		/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, a_block_done

	/* ---- 32-byte block phase ---- */
a_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_block_unaligned

a_block_aligned:
	lwi	r9, r6, 0		/* t1 = *(s + 0) */
	lwi	r10, r6, 4		/* t2 = *(s + 4) */
	lwi	r11, r6, 8		/* t3 = *(s + 8) */
	lwi	r12, r6, 12		/* t4 = *(s + 12) */
	swi	r9, r5, 0		/* *(d + 0) = t1 */
	swi	r10, r5, 4		/* *(d + 4) = t2 */
	swi	r11, r5, 8		/* *(d + 8) = t3 */
	swi	r12, r5, 12		/* *(d + 12) = t4 */
	lwi	r9, r6, 16		/* t1 = *(s + 16) */
	lwi	r10, r6, 20		/* t2 = *(s + 20) */
	lwi	r11, r6, 24		/* t3 = *(s + 24) */
	lwi	r12, r6, 28		/* t4 = *(s + 28) */
	swi	r9, r5, 16		/* *(d + 16) = t1 */
	swi	r10, r5, 20		/* *(d + 20) = t2 */
	swi	r11, r5, 24		/* *(d + 24) = t3 */
	swi	r12, r5, 28		/* *(d + 28) = t4 */
	addi	r6, r6, 32		/* s = s + 32 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, a_block_aligned	/* while (n) loop */
	addi	r5, r5, 32		/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

	/*
	 * Unaligned source: r8 walks the aligned source words, r11 carries
	 * the bytes left over from the previous word.  r6 is advanced by the
	 * whole block count up front because the loops below index through
	 * r8 only.
	 */
a_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	add	r6, r6, r4		/* s = s + n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9, a_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_block_u2		/* t1 was 2 => 2 byte offset */

a_block_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_bu3_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu3_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_bu1_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu1_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	bri	a_block_done

a_block_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_bu2_loop:
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	lwi	r12, r8, 32	/* v = *(as + 32) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r8, r8, 32	/* as = as + 32 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, a_bu2_loop	/* while (n) loop */
	addi	r5, r5, 32	/* d = d + 32 (IN DELAY SLOT) */
	/* falls through into a_block_done */

	/* ---- single-word phase (fewer than 32 bytes left) ---- */
a_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4, a_xfer_end	/* if n < 0, less than one word to transfer */

a_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	addi	r10, r0, 0		/* offset = 0 */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, a_word_unaligned

a_word_aligned:
	lw	r9, r6, r10		/* t1 = *(s+offset) */
	sw	r9, r5, r10		/* *(d+offset) = t1 */
	addi	r4, r4,-4		/* n = n - 4 */
	bneid	r4, a_word_aligned	/* loop */
	addi	r10, r10, 4		/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lwi	r11, r8, 0		/* h = *(as + 0) */
	addi	r8, r8, 4		/* as = as + 4 */

	addi	r9, r9, -1
	beqi	r9, a_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9, a_word_u2		/* t1 was 2 => 2 byte offset */

a_word_u3:
	bslli	r11, r11, 24	/* h = h << 24 */
a_wu3_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 8	/* t1 = v >> 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 24	/* h = v << 24 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu3_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u1:
	bslli	r11, r11, 8	/* h = h << 8 */
a_wu1_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 24	/* t1 = v >> 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 8	/* h = v << 8 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu1_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */

	bri	a_word_done

a_word_u2:
	bslli	r11, r11, 16	/* h = h << 16 */
a_wu2_loop:
	lw	r12, r8, r10	/* v = *(as + offset) */
	bsrli	r9, r12, 16	/* t1 = v >> 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r10	/* *(d + offset) = t1 */
	bslli	r11, r12, 16	/* h = v << 16 */
	addi	r4, r4,-4	/* n = n - 4 */
	bneid	r4, a_wu2_loop	/* while (n) loop */
	addi	r10, r10, 4	/* offset = offset + 4 (IN DELAY SLOT) */
	/* falls through into a_word_done */

a_word_done:
	add	r5, r5, r10	/* d = d + offset */
	add	r6, r6, r10	/* s = s + offset */
	rsub	r7, r10, r7	/* c = c - offset */

	/* ---- trailing bytes, one at a time ---- */
a_xfer_end:
a_xfer_end_loop:
	beqi	r7, a_done		/* while (c) */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r6, r6, 1		/* s++ */
	sbi	r9, r5, 0		/* *d = t1 */
	addi	r7, r7, -1		/* c-- */
	brid	a_xfer_end_loop		/* loop */
	addi	r5, r5, 1		/* d++ (IN DELAY SLOT) */

a_done:
	rtsd	r15, 8
	nop

.size	memcpy, . - memcpy
.end memcpy
349/*----------------------------------------------------------------------------*/
/*
 * memmove - copy r7 bytes from r6 to r5; the regions may overlap.
 *
 * Input:   r5 = destination, r6 = source, r7 = byte count
 * Output:  r3 = original destination
 * Scratch: r4, r8, r9, r10, r11, r12
 *
 * When the destination is not above the source (unsigned compare) the work
 * is handed to fast_memcpy_ascending.  Otherwise both pointers are moved to
 * one past the end of their buffers and the copy runs downward through the
 * same phases as the ascending copy: bytes until the destination is word
 * aligned, 32-byte blocks, single words, trailing bytes.  A source that is
 * not word aligned is handled by merging neighbouring aligned source words
 * with a shift/or pair.
 */
	.globl	memmove
	.type	memmove, @function
	.ent	memmove

memmove:
	cmpu	r4, r5, r6	/* n = s - d */
	bgei	r4,fast_memcpy_ascending

fast_memcpy_descending:
	/* move d to return register as value of function */
	addi	r3, r5, 0

	add	r5, r5, r7	/* d = d + c */
	add	r6, r6, r7	/* s = s + c */

	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

	/* transfer first 0~3 bytes to get aligned dest address */
	andi	r4, r5, 3		/* n = d & 3 */
	/* if zero, destination already aligned */
	beqi	r4,d_dalign_done
	rsub	r7, r4, r7		/* c = c - n adjust c */

d_xfer_first_loop:
	/* if no bytes left to transfer, transfer the bulk */
	beqi	r4,d_dalign_done
	addi	r6, r6, -1		/* s-- */
	addi	r5, r5, -1		/* d-- */
	lbui	r11, r6, 0		/* h = *s */
	sbi	r11, r5, 0		/* *d = h */
	brid	d_xfer_first_loop	/* loop */
	addi	r4, r4, -1		/* n-- (IN DELAY SLOT) */

	/* ---- destination is word aligned from here on ---- */
d_dalign_done:
	addi	r4, r0, 32	/* n = 32 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	/* if n < 0, less than one block to transfer */
	blti	r4, d_block_done

	/* ---- 32-byte block phase ---- */
d_block_xfer:
	andi	r4, r7, 0xffffffe0	/* n = c & ~31 */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_block_unaligned

d_block_aligned:
	addi	r6, r6, -32		/* s = s - 32 */
	addi	r5, r5, -32		/* d = d - 32 */
	lwi	r9, r6, 28		/* t1 = *(s + 28) */
	lwi	r10, r6, 24		/* t2 = *(s + 24) */
	lwi	r11, r6, 20		/* t3 = *(s + 20) */
	lwi	r12, r6, 16		/* t4 = *(s + 16) */
	swi	r9, r5, 28		/* *(d + 28) = t1 */
	swi	r10, r5, 24		/* *(d + 24) = t2 */
	swi	r11, r5, 20		/* *(d + 20) = t3 */
	swi	r12, r5, 16		/* *(d + 16) = t4 */
	lwi	r9, r6, 12		/* t1 = *(s + 12) */
	lwi	r10, r6, 8		/* t2 = *(s + 8) */
	lwi	r11, r6, 4		/* t3 = *(s + 4) */
	lwi	r12, r6, 0		/* t4 = *(s + 0) */
	swi	r9, r5, 12		/* *(d + 12) = t1 */
	swi	r10, r5, 8		/* *(d + 8) = t2 */
	swi	r11, r5, 4		/* *(d + 4) = t3 */
	addi	r4, r4, -32		/* n = n - 32 */
	bneid	r4, d_block_aligned	/* while (n) loop */
	swi	r12, r5, 0		/* *(d + 0) = t4 (IN DELAY SLOT) */
	bri	d_block_done

	/*
	 * Unaligned source: r8 walks the aligned source words downward, r11
	 * carries the bytes left over from the previous (higher) word.  r6 is
	 * moved down by the whole block count up front because the loops
	 * below index through r8 only.
	 */
d_block_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	rsub	r6, r4, r6		/* s = s - n */
	lwi	r11, r8, 0		/* h = *(as + 0) */

	addi	r9, r9, -1
	beqi	r9,d_block_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_block_u2		/* t1 was 2 => 2 byte offset */

d_block_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_bu3_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 8	/* h = v >> 8 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_bu1_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 24	/* h = v >> 24 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */
	bri	d_block_done

d_block_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_bu2_loop:
	addi	r8, r8, -32	/* as = as - 32 */
	addi	r5, r5, -32	/* d = d - 32 */
	lwi	r12, r8, 28	/* v = *(as + 28) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 28	/* *(d + 28) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 24	/* v = *(as + 24) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 24	/* *(d + 24) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 20	/* v = *(as + 20) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 20	/* *(d + 20) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 16	/* v = *(as + 16) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 16	/* *(d + 16) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 12	/* v = *(as + 12) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 12	/* *(d + 12) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 8	/* v = *(as + 8) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 8	/* *(d + 8) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 4	/* v = *(as + 4) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 4	/* *(d + 4) = t1 */
	bsrli	r11, r12, 16	/* h = v >> 16 */
	lwi	r12, r8, 0	/* v = *(as + 0) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	swi	r9, r5, 0	/* *(d + 0) = t1 */
	addi	r4, r4, -32	/* n = n - 32 */
	bneid	r4, d_bu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through into d_block_done */

	/* ---- single-word phase (fewer than 32 bytes left) ---- */
d_block_done:
	addi	r4, r0, 4	/* n = 4 */
	cmpu	r4, r4, r7	/* n = c - n  (unsigned) */
	blti	r4,d_xfer_end	/* if n < 0, less than one word to transfer */

d_word_xfer:
	andi	r4, r7, 0xfffffffc	/* n = c & ~3 */
	rsub	r5, r4, r5		/* d = d - n */
	rsub	r6, r4, r6		/* s = s - n */
	rsub	r7, r4, r7		/* c = c - n */

	andi	r9, r6, 3		/* t1 = s & 3 */
	/* if temp != 0, unaligned transfers needed */
	bnei	r9, d_word_unaligned

d_word_aligned:
	addi	r4, r4,-4		/* n = n - 4 */
	lw	r9, r6, r4		/* t1 = *(s+n) */
	bneid	r4, d_word_aligned	/* loop */
	sw	r9, r5, r4		/* *(d+n) = t1 (IN DELAY SLOT) */

	bri	d_word_done

d_word_unaligned:
	andi	r8, r6, 0xfffffffc	/* as = s & ~3 */
	lw	r11, r8, r4		/* h = *(as + n) */

	addi	r9, r9, -1
	beqi	r9,d_word_u1		/* t1 was 1 => 1 byte offset */
	addi	r9, r9, -1
	beqi	r9,d_word_u2		/* t1 was 2 => 2 byte offset */

d_word_u3:
	bsrli	r11, r11, 8	/* h = h >> 8 */
d_wu3_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 24	/* t1 = v << 24 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu3_loop	/* while (n) loop */
	bsrli	r11, r12, 8	/* h = v >> 8 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u1:
	bsrli	r11, r11, 24	/* h = h >> 24 */
d_wu1_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 8	/* t1 = v << 8 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu1_loop	/* while (n) loop */
	bsrli	r11, r12, 24	/* h = v >> 24 (IN DELAY SLOT) */

	bri	d_word_done

d_word_u2:
	bsrli	r11, r11, 16	/* h = h >> 16 */
d_wu2_loop:
	addi	r4, r4,-4	/* n = n - 4 */
	lw	r12, r8, r4	/* v = *(as + n) */
	bslli	r9, r12, 16	/* t1 = v << 16 */
	or	r9, r11, r9	/* t1 = h | t1 */
	sw	r9, r5, r4	/* *(d + n) = t1 */
	bneid	r4, d_wu2_loop	/* while (n) loop */
	bsrli	r11, r12, 16	/* h = v >> 16 (IN DELAY SLOT) */
	/* falls through into d_word_done */

d_word_done:

	/* ---- trailing bytes, one at a time ---- */
d_xfer_end:
d_xfer_end_loop:
	/*
	 * Leave through memmove's own epilogue rather than a label that
	 * lives inside memcpy's .ent/.end range.
	 */
	beqi	r7, d_done		/* while (c) */
	addi	r6, r6, -1		/* s-- */
	lbui	r9, r6, 0		/* t1 = *s */
	addi	r5, r5, -1		/* d-- */
	sbi	r9, r5, 0		/* *d = t1 */
	brid	d_xfer_end_loop		/* loop */
	addi	r7, r7, -1		/* c-- (IN DELAY SLOT) */

d_done:
	rtsd	r15, 8
	nop

.size	memmove, . - memmove
.end memmove
diff --git a/arch/microblaze/lib/memcpy.c b/arch/microblaze/lib/memcpy.c
new file mode 100644
index 000000000000..5880119c4487
--- /dev/null
+++ b/arch/microblaze/lib/memcpy.c
@@ -0,0 +1,161 @@
1/*
2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2007 John Williams
5 *
6 * Reasonably optimised generic C-code for memcpy on Microblaze
7 * This is generic C code to do efficient, alignment-aware memcpy.
8 *
9 * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
10 * http://www.embedded.com/showArticle.jhtml?articleID=19205567
11 *
12 * Attempts were made, unsuccessfully, to contact the original
13 * author of this code (Michael Morrow, Intel). Below is the original
14 * copyright notice.
15 *
16 * This software has been developed by Intel Corporation.
17 * Intel specifically disclaims all warranties, express or
18 * implied, and all liability, including consequential and
19 * other indirect damages, for the use of this program, including
20 * liability for infringement of any proprietary rights,
21 * and including the warranties of merchantability and fitness
22 * for a particular purpose. Intel does not assume any
23 * responsibility for and errors which may appear in this program
24 * not any responsibility to update it.
25 */
26
27#include <linux/types.h>
28#include <linux/stddef.h>
29#include <linux/compiler.h>
30#include <linux/module.h>
31
32#include <linux/string.h>
33#include <asm/system.h>
34
#ifdef __HAVE_ARCH_MEMCPY
/*
 * Word-merge helpers for the unaligned-source paths of memcpy().
 *
 * The source is read as aligned 32-bit words.  "hold" keeps the bytes of
 * the previous word that have not been written yet; each output word is the
 * hold plus the leading bytes of the next word.  "bits" is 8 * (src & 3) and
 * is always passed as a constant (8, 16 or 24) so the shifts stay constant.
 * Which direction moves a byte towards lower addresses depends on the byte
 * order, hence the two variants.
 */
#if defined(__MICROBLAZEEL__) || \
	(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
/* little-endian: lower addresses live in the less significant bytes */
#define MEMCPY_HOLD(w, bits)		((w) >> (bits))
#define MEMCPY_MERGE(hold, w, bits)	((hold) | ((w) << (32 - (bits))))
#else
/* big-endian: lower addresses live in the more significant bytes */
#define MEMCPY_HOLD(w, bits)		((w) << (bits))
#define MEMCPY_MERGE(hold, w, bits)	((hold) | ((w) >> (32 - (bits))))
#endif

/*
 * memcpy - copy c bytes from v_src to v_dst (ascending addresses).
 *
 * Without CONFIG_OPT_LIB_FUNCTION this is a plain byte loop.  With it, the
 * destination is first byte-copied up to a word boundary, the bulk is then
 * moved one 32-bit word at a time, and the last 0-3 bytes are copied
 * singly.  If the source is not word aligned once the destination is, the
 * output words are assembled from neighbouring aligned source words; this
 * reads the whole aligned words that contain the first and last source
 * bytes.
 *
 * Returns v_dst.
 */
void *memcpy(void *v_dst, const void *v_src, __kernel_size_t c)
{
	const char *src = v_src;
	char *dst = v_dst;
#ifndef CONFIG_OPT_LIB_FUNCTION
	/* Simple, byte oriented memcpy. */
	while (c--)
		*dst++ = *src++;

	return v_dst;
#else
	/* The following code tries to optimize the copy by using unsigned
	 * alignment. This will work fine if both source and destination are
	 * aligned on the same boundary. However, if they are aligned on
	 * different boundaries shifts will be necessary. This might result in
	 * bad performance on MicroBlaze systems without a barrel shifter.
	 */
	const uint32_t *i_src;
	uint32_t *i_dst;

	if (c >= 4) {
		uint32_t value, buf_hold;

		/*
		 * Align the destination to a word boundary, byte by byte.
		 * c >= 4 here, so the at most three decrements cannot wrap.
		 */
		switch ((unsigned long)dst & 3) {
		case 1:
			*dst++ = *src++;
			--c;
			/* fall through */
		case 2:
			*dst++ = *src++;
			--c;
			/* fall through */
		case 3:
			*dst++ = *src++;
			--c;
		}

		i_dst = (void *)dst;

		/* Choose a copy scheme based on the source */
		/* alignment relative to destination. */
		switch ((unsigned long)src & 3) {
		case 0x0:	/* Both byte offsets are aligned */
			i_src  = (const void *)src;

			for (; c >= 4; c -= 4)
				*i_dst++ = *i_src++;

			src  = (const void *)i_src;
			break;
		case 0x1:	/* Unaligned - Off by 1 */
			/* Word align the source */
			i_src = (const void *)((unsigned long)src & ~3UL);

			/* Load the holding buffer */
			value = *i_src++;
			buf_hold = MEMCPY_HOLD(value, 8);

			for (; c >= 4; c -= 4) {
				value = *i_src++;
				*i_dst++ = MEMCPY_MERGE(buf_hold, value, 8);
				buf_hold = MEMCPY_HOLD(value, 8);
			}

			/* Realign the source: 3 held bytes are still unread */
			src = (const void *)i_src;
			src -= 3;
			break;
		case 0x2:	/* Unaligned - Off by 2 */
			/* Word align the source */
			i_src = (const void *)((unsigned long)src & ~3UL);

			/* Load the holding buffer */
			value = *i_src++;
			buf_hold = MEMCPY_HOLD(value, 16);

			for (; c >= 4; c -= 4) {
				value = *i_src++;
				*i_dst++ = MEMCPY_MERGE(buf_hold, value, 16);
				buf_hold = MEMCPY_HOLD(value, 16);
			}

			/* Realign the source: 2 held bytes are still unread */
			src = (const void *)i_src;
			src -= 2;
			break;
		case 0x3:	/* Unaligned - Off by 3 */
			/* Word align the source */
			i_src = (const void *)((unsigned long)src & ~3UL);

			/* Load the holding buffer */
			value = *i_src++;
			buf_hold = MEMCPY_HOLD(value, 24);

			for (; c >= 4; c -= 4) {
				value = *i_src++;
				*i_dst++ = MEMCPY_MERGE(buf_hold, value, 24);
				buf_hold = MEMCPY_HOLD(value, 24);
			}

			/* Realign the source: 1 held byte is still unread */
			src = (const void *)i_src;
			src -= 1;
			break;
		}
		dst = (void *)i_dst;
	}

	/* Finish off any remaining bytes (c is 0..3 at this point) */
	switch (c) {
	case 3:
		*dst++ = *src++;
		/* fall through */
	case 2:
		*dst++ = *src++;
		/* fall through */
	case 1:
		*dst++ = *src++;
	}

	return v_dst;
#endif
}
EXPORT_SYMBOL(memcpy);

#undef MEMCPY_HOLD
#undef MEMCPY_MERGE
#endif /* __HAVE_ARCH_MEMCPY */
157
158void *cacheable_memcpy(void *d, const void *s, __kernel_size_t c)
159{
160 return memcpy(d, s, c);
161}
diff --git a/arch/microblaze/lib/memmove.c b/arch/microblaze/lib/memmove.c
new file mode 100644
index 000000000000..d4e9f49a71f7
--- /dev/null
+++ b/arch/microblaze/lib/memmove.c
@@ -0,0 +1,175 @@
1/*
2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2007 John Williams
5 *
6 * Reasonably optimised generic C-code for memmove on Microblaze
7 * This is generic C code to do efficient, alignment-aware memmove.
8 *
9 * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
10 * http://www.embedded.com/showArticle.jhtml?articleID=19205567
11 *
12 * Attempts were made, unsuccessfully, to contact the original
13 * author of this code (Michael Morrow, Intel). Below is the original
14 * copyright notice.
15 *
16 * This software has been developed by Intel Corporation.
17 * Intel specifically disclaims all warranties, express or
18 * implied, and all liability, including consequential and
19 * other indirect damages, for the use of this program, including
20 * liability for infringement of any proprietary rights,
21 * and including the warranties of merchantability and fitness
22 * for a particular purpose. Intel does not assume any
23 * responsibility for and errors which may appear in this program
24 * not any responsibility to update it.
25 */
26
27#include <linux/types.h>
28#include <linux/stddef.h>
29#include <linux/compiler.h>
30#include <linux/module.h>
31#include <linux/string.h>
32
#ifdef __HAVE_ARCH_MEMMOVE
/*
 * Word-merge helpers for the unaligned-source paths of the descending copy.
 *
 * The source is read downward as aligned 32-bit words.  "hold" keeps the
 * lowest-addressed bytes of the previously read (higher) word; they become
 * the highest-addressed bytes of the next output word, the rest comes from
 * the word just read.  "bits" is 8 * (src & 3) and is always passed as a
 * constant (8, 16 or 24) so the shifts stay constant.
 *
 * Big-endian example, src & 3 == 1, previous word 0x14151617, word just
 * read 0x10111213: the four source bytes are 11 12 13 14, so the output
 * must be (0x10111213 << 8) | (0x14151617 >> 24) = 0x11121314.
 */
#if defined(__MICROBLAZEEL__) || \
	(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__))
/* little-endian: lower addresses live in the less significant bytes */
#define MEMMOVE_HOLD(w, bits)		((w) << (32 - (bits)))
#define MEMMOVE_MERGE(hold, w, bits)	((hold) | ((w) >> (bits)))
#else
/* big-endian: lower addresses live in the more significant bytes */
#define MEMMOVE_HOLD(w, bits)		((w) >> (32 - (bits)))
#define MEMMOVE_MERGE(hold, w, bits)	((hold) | ((w) << (bits)))
#endif

/*
 * memmove - copy c bytes from v_src to v_dst; the regions may overlap.
 *
 * When the destination is not above the source the copy is delegated to
 * memcpy().  Otherwise it runs from the end of the buffers downward: as a
 * plain byte loop without CONFIG_OPT_LIB_FUNCTION, or with it by
 * byte-copying until the destination is word aligned, moving 32-bit words,
 * and finishing the last 0-3 bytes singly.  The unaligned-source paths read
 * the whole aligned words that contain the first and last source bytes.
 *
 * Returns v_dst.
 */
void *memmove(void *v_dst, const void *v_src, __kernel_size_t c)
{
	const char *src = v_src;
	char *dst = v_dst;

#ifdef CONFIG_OPT_LIB_FUNCTION
	const uint32_t *i_src;
	uint32_t *i_dst;
#endif

	if (!c)
		return v_dst;

	/* Use memcpy when source is higher than dest */
	if (v_dst <= v_src)
		return memcpy(v_dst, v_src, c);

#ifndef CONFIG_OPT_LIB_FUNCTION
	/* copy backwards, from end to beginning */
	src += c;
	dst += c;

	/* Simple, byte oriented memmove. */
	while (c--)
		*--dst = *--src;

	return v_dst;
#else
	/* The following code tries to optimize the copy by using unsigned
	 * alignment. This will work fine if both source and destination are
	 * aligned on the same boundary. However, if they are aligned on
	 * different boundaries shifts will be necessary. This might result in
	 * bad performance on MicroBlaze systems without a barrel shifter.
	 */
	/* Do a descending copy: both pointers start one past the end. */
	dst += c;
	src += c;

	if (c >= 4) {
		uint32_t value, buf_hold;

		/*
		 * Align the destination to a word boundary, byte by byte.
		 * c >= 4 here, so the at most three decrements cannot wrap.
		 */
		switch ((unsigned long)dst & 3) {
		case 3:
			*--dst = *--src;
			--c;
			/* fall through */
		case 2:
			*--dst = *--src;
			--c;
			/* fall through */
		case 1:
			*--dst = *--src;
			--c;
		}

		i_dst = (void *)dst;
		/* Choose a copy scheme based on the source */
		/* alignment relative to destination. */
		switch ((unsigned long)src & 3) {
		case 0x0:	/* Both byte offsets are aligned */

			i_src  = (const void *)src;

			for (; c >= 4; c -= 4)
				*--i_dst = *--i_src;

			src  = (const void *)i_src;
			break;
		case 0x1:	/* Unaligned - Off by 1 */
			/* Word align the source (round up past the end) */
			i_src = (const void *)
				(((unsigned long)src + 4) & ~3UL);

			/* Load the holding buffer */
			value = *--i_src;
			buf_hold = MEMMOVE_HOLD(value, 8);

			for (; c >= 4; c -= 4) {
				value = *--i_src;
				*--i_dst = MEMMOVE_MERGE(buf_hold, value, 8);
				buf_hold = MEMMOVE_HOLD(value, 8);
			}

			/* Realign the source: 1 held byte is still unwritten */
			src = (const void *)i_src;
			src += 1;
			break;
		case 0x2:	/* Unaligned - Off by 2 */
			/* Word align the source (round up past the end) */
			i_src = (const void *)
				(((unsigned long)src + 4) & ~3UL);

			/* Load the holding buffer */
			value = *--i_src;
			buf_hold = MEMMOVE_HOLD(value, 16);

			for (; c >= 4; c -= 4) {
				value = *--i_src;
				*--i_dst = MEMMOVE_MERGE(buf_hold, value, 16);
				buf_hold = MEMMOVE_HOLD(value, 16);
			}

			/* Realign the source: 2 held bytes are still unwritten */
			src = (const void *)i_src;
			src += 2;
			break;
		case 0x3:	/* Unaligned - Off by 3 */
			/* Word align the source (round up past the end) */
			i_src = (const void *)
				(((unsigned long)src + 4) & ~3UL);

			/* Load the holding buffer */
			value = *--i_src;
			buf_hold = MEMMOVE_HOLD(value, 24);

			for (; c >= 4; c -= 4) {
				value = *--i_src;
				*--i_dst = MEMMOVE_MERGE(buf_hold, value, 24);
				buf_hold = MEMMOVE_HOLD(value, 24);
			}

			/* Realign the source: 3 held bytes are still unwritten */
			src = (const void *)i_src;
			src += 3;
			break;
		}
		dst = (void *)i_dst;
	}

	/*
	 * Finish off any remaining bytes.  c is 0..3 here: either it was
	 * below 4 to begin with or the word loops above ran until it was.
	 */
	switch (c) {
	case 3:
		*--dst = *--src;
		/* fall through */
	case 2:
		*--dst = *--src;
		/* fall through */
	case 1:
		*--dst = *--src;
	}
	return v_dst;
#endif
}
EXPORT_SYMBOL(memmove);

#undef MEMMOVE_HOLD
#undef MEMMOVE_MERGE
#endif /* __HAVE_ARCH_MEMMOVE */
diff --git a/arch/microblaze/lib/memset.c b/arch/microblaze/lib/memset.c
new file mode 100644
index 000000000000..941dc8f94b03
--- /dev/null
+++ b/arch/microblaze/lib/memset.c
@@ -0,0 +1,82 @@
1/*
2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2007 John Williams
5 *
6 * Reasonably optimised generic C-code for memset on Microblaze
7 * This is generic C code to do efficient, alignment-aware memset.
8 *
9 * It is based on demo code originally Copyright 2001 by Intel Corp, taken from
10 * http://www.embedded.com/showArticle.jhtml?articleID=19205567
11 *
12 * Attempts were made, unsuccessfully, to contact the original
13 * author of this code (Michael Morrow, Intel). Below is the original
14 * copyright notice.
15 *
16 * This software has been developed by Intel Corporation.
17 * Intel specifically disclaims all warranties, express or
18 * implied, and all liability, including consequential and
19 * other indirect damages, for the use of this program, including
20 * liability for infringement of any proprietary rights,
21 * and including the warranties of merchantability and fitness
22 * for a particular purpose. Intel does not assume any
23 * responsibility for and errors which may appear in this program
24 * not any responsibility to update it.
25 */
26
27#include <linux/types.h>
28#include <linux/stddef.h>
29#include <linux/compiler.h>
30#include <linux/module.h>
31#include <linux/string.h>
32
#ifdef __HAVE_ARCH_MEMSET
/*
 * memset - fill n bytes starting at v_src with the low 8 bits of c.
 *
 * With CONFIG_OPT_LIB_FUNCTION and at least four bytes to fill, the
 * pointer is first advanced byte-wise to a word boundary and the bulk is
 * then written as 32-bit words holding the byte replicated four times.
 * Whatever remains (or everything, without the option) is written one
 * byte at a time.
 *
 * Returns v_src.
 */
void *memset(void *v_src, int c, __kernel_size_t n)
{
	char *src = v_src;
#ifdef CONFIG_OPT_LIB_FUNCTION
	uint32_t *i_src;
	uint32_t w32;
#endif
	/* Truncate c to 8 bits */
	c = (c & 0xFF);

#ifdef CONFIG_OPT_LIB_FUNCTION
	/* Make a repeating word out of it: 0xXX -> 0xXXXX -> 0xXXXXXXXX */
	w32 = c;
	w32 |= w32 << 8;
	w32 |= w32 << 16;

	if (n >= 4) {
		/*
		 * Align the destination to a word boundary, byte by byte.
		 * n >= 4 here, so the at most three decrements cannot wrap.
		 */
		switch ((unsigned long)src & 3) {
		case 1:
			*src++ = c;
			--n;
			/* fall through */
		case 2:
			*src++ = c;
			--n;
			/* fall through */
		case 3:
			*src++ = c;
			--n;
		}

		i_src = (void *)src;

		/* Do as many full-word stores as we can */
		for (; n >= 4; n -= 4)
			*i_src++ = w32;

		src  = (void *)i_src;
	}
#endif
	/* Simple, byte oriented memset or the rest of count. */
	while (n--)
		*src++ = c;

	return v_src;
}
EXPORT_SYMBOL(memset);
#endif /* __HAVE_ARCH_MEMSET */