diff options
author | Michal Simek <monstr@monstr.eu> | 2009-03-27 09:25:21 -0400 |
---|---|---|
committer | Michal Simek <monstr@monstr.eu> | 2009-03-27 09:25:21 -0400 |
commit | 322ae8eb91c1730728400c5b8dd1108aef1205b8 (patch) | |
tree | d5c74be5b85ba2938f98172e72727b31fc214eaa /arch/microblaze | |
parent | 16bfeaf23ead78d937b3eacfb5c7cdc7bff6d3da (diff) |
microblaze_v8: supported function for memory - kernel/lib
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Stephen Neuendorffer <stephen.neuendorffer@xilinx.com>
Acked-by: John Linn <john.linn@xilinx.com>
Acked-by: John Williams <john.williams@petalogix.com>
Signed-off-by: Michal Simek <monstr@monstr.eu>
Diffstat (limited to 'arch/microblaze')
-rw-r--r-- | arch/microblaze/lib/fastcopy.S | 662 | ||||
-rw-r--r-- | arch/microblaze/lib/memcpy.c | 161 | ||||
-rw-r--r-- | arch/microblaze/lib/memmove.c | 175 | ||||
-rw-r--r-- | arch/microblaze/lib/memset.c | 82 |
4 files changed, 1080 insertions, 0 deletions
diff --git a/arch/microblaze/lib/fastcopy.S b/arch/microblaze/lib/fastcopy.S new file mode 100644 index 00000000000..02e3ab4eddf --- /dev/null +++ b/arch/microblaze/lib/fastcopy.S | |||
@@ -0,0 +1,662 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> | ||
3 | * Copyright (C) 2008-2009 PetaLogix | ||
4 | * Copyright (C) 2008 Jim Law - Iris LP All rights reserved. | ||
5 | * | ||
6 | * This file is subject to the terms and conditions of the GNU General | ||
7 | * Public License. See the file COPYING in the main directory of this | ||
8 | * archive for more details. | ||
9 | * | ||
10 | * Written by Jim Law <jlaw@irispower.com> | ||
11 | * | ||
12 | * intended to replace: | ||
13 | * memcpy in memcpy.c and | ||
14 | * memmove in memmove.c | ||
15 | * ... in arch/microblaze/lib | ||
16 | * | ||
17 | * | ||
18 | * assly_fastcopy.S | ||
19 | * | ||
20 | * Attempt at quicker memcpy and memmove for MicroBlaze | ||
21 | * Input : Operand1 in Reg r5 - destination address | ||
22 | * Operand2 in Reg r6 - source address | ||
23 | * Operand3 in Reg r7 - number of bytes to transfer | ||
24 | * Output: Result in Reg r3 - starting destination address | ||
25 | * | ||
26 | * | ||
27 | * Explanation: | ||
28 | * Perform (possibly unaligned) copy of a block of memory | ||
29 | * between mem locations with size of xfer spec'd in bytes | ||
30 | */ | ||
31 | |||
32 | #include <linux/linkage.h> | ||
33 | |||
34 | .globl memcpy | ||
35 | .ent memcpy | ||
36 | |||
37 | memcpy: /* in: r5 = d, r6 = s, r7 = c (bytes); out: r3 = original d */ | ||
38 | fast_memcpy_ascending: /* also branched to from memmove when d <= s */ | ||
39 | /* move d to return register as value of function */ | ||
40 | addi r3, r5, 0 | ||
41 | |||
42 | addi r4, r0, 4 /* n = 4 */ | ||
43 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ | ||
44 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ | ||
45 | |||
46 | /* transfer first 0~3 bytes to get aligned dest address */ | ||
47 | andi r4, r5, 3 /* n = d & 3 */ | ||
48 | /* if zero, destination already aligned */ | ||
49 | beqi r4, a_dalign_done | ||
50 | /* n = 4 - n (yields 3, 2, 1 transfers for 1, 2, 3 addr offset) */ | ||
51 | rsubi r4, r4, 4 | ||
52 | rsub r7, r4, r7 /* c = c - n adjust c */ | ||
53 | |||
54 | a_xfer_first_loop: | ||
55 | /* if no bytes left to transfer, transfer the bulk */ | ||
56 | beqi r4, a_dalign_done | ||
57 | lbui r11, r6, 0 /* h = *s */ | ||
58 | sbi r11, r5, 0 /* *d = h */ | ||
59 | addi r6, r6, 1 /* s++ */ | ||
60 | addi r5, r5, 1 /* d++ */ | ||
61 | brid a_xfer_first_loop /* loop */ | ||
62 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ | ||
63 | |||
64 | a_dalign_done: | ||
65 | addi r4, r0, 32 /* n = 32 */ | ||
66 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ | ||
67 | /* if n < 0, less than one block to transfer */ | ||
68 | blti r4, a_block_done | ||
69 | |||
70 | a_block_xfer: | ||
71 | andi r4, r7, 0xffffffe0 /* n = c & ~31 */ | ||
72 | rsub r7, r4, r7 /* c = c - n */ | ||
73 | |||
74 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
75 | /* if temp != 0, unaligned transfers needed */ | ||
76 | bnei r9, a_block_unaligned | ||
77 | |||
78 | a_block_aligned: | ||
79 | lwi r9, r6, 0 /* t1 = *(s + 0) */ | ||
80 | lwi r10, r6, 4 /* t2 = *(s + 4) */ | ||
81 | lwi r11, r6, 8 /* t3 = *(s + 8) */ | ||
82 | lwi r12, r6, 12 /* t4 = *(s + 12) */ | ||
83 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
84 | swi r10, r5, 4 /* *(d + 4) = t2 */ | ||
85 | swi r11, r5, 8 /* *(d + 8) = t3 */ | ||
86 | swi r12, r5, 12 /* *(d + 12) = t4 */ | ||
87 | lwi r9, r6, 16 /* t1 = *(s + 16) */ | ||
88 | lwi r10, r6, 20 /* t2 = *(s + 20) */ | ||
89 | lwi r11, r6, 24 /* t3 = *(s + 24) */ | ||
90 | lwi r12, r6, 28 /* t4 = *(s + 28) */ | ||
91 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
92 | swi r10, r5, 20 /* *(d + 20) = t2 */ | ||
93 | swi r11, r5, 24 /* *(d + 24) = t3 */ | ||
94 | swi r12, r5, 28 /* *(d + 28) = t4 */ | ||
95 | addi r6, r6, 32 /* s = s + 32 */ | ||
96 | addi r4, r4, -32 /* n = n - 32 */ | ||
97 | bneid r4, a_block_aligned /* while (n) loop */ | ||
98 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
99 | bri a_block_done | ||
100 | |||
101 | a_block_unaligned: | ||
102 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
103 | add r6, r6, r4 /* s = s + n (advanced once; the loops below walk as, not s) */ | ||
104 | lwi r11, r8, 0 /* h = *(as + 0) */ | ||
105 | |||
106 | addi r9, r9, -1 | ||
107 | beqi r9, a_block_u1 /* t1 was 1 => 1 byte offset */ | ||
108 | addi r9, r9, -1 | ||
109 | beqi r9, a_block_u2 /* t1 was 2 => 2 byte offset */ | ||
110 | |||
111 | a_block_u3: | ||
112 | bslli r11, r11, 24 /* h = h << 24 */ | ||
113 | a_bu3_loop: | ||
114 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
115 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
116 | or r9, r11, r9 /* t1 = h | t1 */ | ||
117 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
118 | bslli r11, r12, 24 /* h = v << 24 */ | ||
119 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
120 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
121 | or r9, r11, r9 /* t1 = h | t1 */ | ||
122 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
123 | bslli r11, r12, 24 /* h = v << 24 */ | ||
124 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
125 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
126 | or r9, r11, r9 /* t1 = h | t1 */ | ||
127 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
128 | bslli r11, r12, 24 /* h = v << 24 */ | ||
129 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
130 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
131 | or r9, r11, r9 /* t1 = h | t1 */ | ||
132 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
133 | bslli r11, r12, 24 /* h = v << 24 */ | ||
134 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
135 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
136 | or r9, r11, r9 /* t1 = h | t1 */ | ||
137 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
138 | bslli r11, r12, 24 /* h = v << 24 */ | ||
139 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
140 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
141 | or r9, r11, r9 /* t1 = h | t1 */ | ||
142 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
143 | bslli r11, r12, 24 /* h = v << 24 */ | ||
144 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
145 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
146 | or r9, r11, r9 /* t1 = h | t1 */ | ||
147 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
148 | bslli r11, r12, 24 /* h = v << 24 */ | ||
149 | lwi r12, r8, 32 /* v = *(as + 32) */ | ||
150 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
151 | or r9, r11, r9 /* t1 = h | t1 */ | ||
152 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
153 | bslli r11, r12, 24 /* h = v << 24 */ | ||
154 | addi r8, r8, 32 /* as = as + 32 */ | ||
155 | addi r4, r4, -32 /* n = n - 32 */ | ||
156 | bneid r4, a_bu3_loop /* while (n) loop */ | ||
157 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
158 | bri a_block_done | ||
159 | |||
160 | a_block_u1: | ||
161 | bslli r11, r11, 8 /* h = h << 8 */ | ||
162 | a_bu1_loop: | ||
163 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
164 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
165 | or r9, r11, r9 /* t1 = h | t1 */ | ||
166 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
167 | bslli r11, r12, 8 /* h = v << 8 */ | ||
168 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
169 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
170 | or r9, r11, r9 /* t1 = h | t1 */ | ||
171 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
172 | bslli r11, r12, 8 /* h = v << 8 */ | ||
173 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
174 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
175 | or r9, r11, r9 /* t1 = h | t1 */ | ||
176 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
177 | bslli r11, r12, 8 /* h = v << 8 */ | ||
178 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
179 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
180 | or r9, r11, r9 /* t1 = h | t1 */ | ||
181 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
182 | bslli r11, r12, 8 /* h = v << 8 */ | ||
183 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
184 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
185 | or r9, r11, r9 /* t1 = h | t1 */ | ||
186 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
187 | bslli r11, r12, 8 /* h = v << 8 */ | ||
188 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
189 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
190 | or r9, r11, r9 /* t1 = h | t1 */ | ||
191 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
192 | bslli r11, r12, 8 /* h = v << 8 */ | ||
193 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
194 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
195 | or r9, r11, r9 /* t1 = h | t1 */ | ||
196 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
197 | bslli r11, r12, 8 /* h = v << 8 */ | ||
198 | lwi r12, r8, 32 /* v = *(as + 32) */ | ||
199 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
200 | or r9, r11, r9 /* t1 = h | t1 */ | ||
201 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
202 | bslli r11, r12, 8 /* h = v << 8 */ | ||
203 | addi r8, r8, 32 /* as = as + 32 */ | ||
204 | addi r4, r4, -32 /* n = n - 32 */ | ||
205 | bneid r4, a_bu1_loop /* while (n) loop */ | ||
206 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
207 | bri a_block_done | ||
208 | |||
209 | a_block_u2: | ||
210 | bslli r11, r11, 16 /* h = h << 16 */ | ||
211 | a_bu2_loop: | ||
212 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
213 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
214 | or r9, r11, r9 /* t1 = h | t1 */ | ||
215 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
216 | bslli r11, r12, 16 /* h = v << 16 */ | ||
217 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
218 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
219 | or r9, r11, r9 /* t1 = h | t1 */ | ||
220 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
221 | bslli r11, r12, 16 /* h = v << 16 */ | ||
222 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
223 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
224 | or r9, r11, r9 /* t1 = h | t1 */ | ||
225 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
226 | bslli r11, r12, 16 /* h = v << 16 */ | ||
227 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
228 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
229 | or r9, r11, r9 /* t1 = h | t1 */ | ||
230 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
231 | bslli r11, r12, 16 /* h = v << 16 */ | ||
232 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
233 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
234 | or r9, r11, r9 /* t1 = h | t1 */ | ||
235 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
236 | bslli r11, r12, 16 /* h = v << 16 */ | ||
237 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
238 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
239 | or r9, r11, r9 /* t1 = h | t1 */ | ||
240 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
241 | bslli r11, r12, 16 /* h = v << 16 */ | ||
242 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
243 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
244 | or r9, r11, r9 /* t1 = h | t1 */ | ||
245 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
246 | bslli r11, r12, 16 /* h = v << 16 */ | ||
247 | lwi r12, r8, 32 /* v = *(as + 32) */ | ||
248 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
249 | or r9, r11, r9 /* t1 = h | t1 */ | ||
250 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
251 | bslli r11, r12, 16 /* h = v << 16 */ | ||
252 | addi r8, r8, 32 /* as = as + 32 */ | ||
253 | addi r4, r4, -32 /* n = n - 32 */ | ||
254 | bneid r4, a_bu2_loop /* while (n) loop */ | ||
255 | addi r5, r5, 32 /* d = d + 32 (IN DELAY SLOT) */ | ||
256 | |||
257 | a_block_done: | ||
258 | addi r4, r0, 4 /* n = 4 */ | ||
259 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ | ||
260 | blti r4, a_xfer_end /* if n < 0, less than one word to transfer */ | ||
261 | |||
262 | a_word_xfer: | ||
263 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ | ||
264 | addi r10, r0, 0 /* offset = 0 */ | ||
265 | |||
266 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
267 | /* if temp != 0, unaligned transfers needed */ | ||
268 | bnei r9, a_word_unaligned | ||
269 | |||
270 | a_word_aligned: | ||
271 | lw r9, r6, r10 /* t1 = *(s+offset) */ | ||
272 | sw r9, r5, r10 /* *(d+offset) = t1 */ | ||
273 | addi r4, r4,-4 /* n = n - 4 */ | ||
274 | bneid r4, a_word_aligned /* loop */ | ||
275 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
276 | |||
277 | bri a_word_done | ||
278 | |||
279 | a_word_unaligned: | ||
280 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
281 | lwi r11, r8, 0 /* h = *(as + 0) */ | ||
282 | addi r8, r8, 4 /* as = as + 4 */ | ||
283 | |||
284 | addi r9, r9, -1 | ||
285 | beqi r9, a_word_u1 /* t1 was 1 => 1 byte offset */ | ||
286 | addi r9, r9, -1 | ||
287 | beqi r9, a_word_u2 /* t1 was 2 => 2 byte offset */ | ||
288 | |||
289 | a_word_u3: | ||
290 | bslli r11, r11, 24 /* h = h << 24 */ | ||
291 | a_wu3_loop: | ||
292 | lw r12, r8, r10 /* v = *(as + offset) */ | ||
293 | bsrli r9, r12, 8 /* t1 = v >> 8 */ | ||
294 | or r9, r11, r9 /* t1 = h | t1 */ | ||
295 | sw r9, r5, r10 /* *(d + offset) = t1 */ | ||
296 | bslli r11, r12, 24 /* h = v << 24 */ | ||
297 | addi r4, r4,-4 /* n = n - 4 */ | ||
298 | bneid r4, a_wu3_loop /* while (n) loop */ | ||
299 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
300 | |||
301 | bri a_word_done | ||
302 | |||
303 | a_word_u1: | ||
304 | bslli r11, r11, 8 /* h = h << 8 */ | ||
305 | a_wu1_loop: | ||
306 | lw r12, r8, r10 /* v = *(as + offset) */ | ||
307 | bsrli r9, r12, 24 /* t1 = v >> 24 */ | ||
308 | or r9, r11, r9 /* t1 = h | t1 */ | ||
309 | sw r9, r5, r10 /* *(d + offset) = t1 */ | ||
310 | bslli r11, r12, 8 /* h = v << 8 */ | ||
311 | addi r4, r4,-4 /* n = n - 4 */ | ||
312 | bneid r4, a_wu1_loop /* while (n) loop */ | ||
313 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
314 | |||
315 | bri a_word_done | ||
316 | |||
317 | a_word_u2: | ||
318 | bslli r11, r11, 16 /* h = h << 16 */ | ||
319 | a_wu2_loop: | ||
320 | lw r12, r8, r10 /* v = *(as + offset) */ | ||
321 | bsrli r9, r12, 16 /* t1 = v >> 16 */ | ||
322 | or r9, r11, r9 /* t1 = h | t1 */ | ||
323 | sw r9, r5, r10 /* *(d + offset) = t1 */ | ||
324 | bslli r11, r12, 16 /* h = v << 16 */ | ||
325 | addi r4, r4,-4 /* n = n - 4 */ | ||
326 | bneid r4, a_wu2_loop /* while (n) loop */ | ||
327 | addi r10, r10, 4 /* offset = offset + 4 (IN DELAY SLOT) */ | ||
328 | |||
329 | a_word_done: | ||
330 | add r5, r5, r10 /* d = d + offset */ | ||
331 | add r6, r6, r10 /* s = s + offset */ | ||
332 | rsub r7, r10, r7 /* c = c - offset */ | ||
333 | |||
334 | a_xfer_end: | ||
335 | a_xfer_end_loop: | ||
336 | beqi r7, a_done /* while (c) */ | ||
337 | lbui r9, r6, 0 /* t1 = *s */ | ||
338 | addi r6, r6, 1 /* s++ */ | ||
339 | sbi r9, r5, 0 /* *d = t1 */ | ||
340 | addi r7, r7, -1 /* c-- */ | ||
341 | brid a_xfer_end_loop /* loop */ | ||
342 | addi r5, r5, 1 /* d++ (IN DELAY SLOT) */ | ||
343 | |||
344 | a_done: | ||
345 | rtsd r15, 8 | ||
346 | nop | ||
347 | |||
348 | .end memcpy | ||
349 | /*----------------------------------------------------------------------------*/ | ||
350 | .globl memmove | ||
351 | .ent memmove | ||
352 | |||
353 | memmove: /* in: r5 = d, r6 = s, r7 = c (bytes); out: r3 = original d */ | ||
354 | cmpu r4, r5, r6 /* n = s - d (unsigned compare) */ | ||
355 | bgei r4,fast_memcpy_ascending /* d <= s: use the ascending copy */ | ||
356 | |||
357 | fast_memcpy_descending: | ||
358 | /* move d to return register as value of function */ | ||
359 | addi r3, r5, 0 | ||
360 | |||
361 | add r5, r5, r7 /* d = d + c */ | ||
362 | add r6, r6, r7 /* s = s + c */ | ||
363 | |||
364 | addi r4, r0, 4 /* n = 4 */ | ||
365 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ | ||
366 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ | ||
367 | |||
368 | /* transfer first 0~3 bytes to get aligned dest address */ | ||
369 | andi r4, r5, 3 /* n = d & 3 */ | ||
370 | /* if zero, destination already aligned */ | ||
371 | beqi r4,d_dalign_done | ||
372 | rsub r7, r4, r7 /* c = c - n adjust c */ | ||
373 | |||
374 | d_xfer_first_loop: | ||
375 | /* if no bytes left to transfer, transfer the bulk */ | ||
376 | beqi r4,d_dalign_done | ||
377 | addi r6, r6, -1 /* s-- */ | ||
378 | addi r5, r5, -1 /* d-- */ | ||
379 | lbui r11, r6, 0 /* h = *s */ | ||
380 | sbi r11, r5, 0 /* *d = h */ | ||
381 | brid d_xfer_first_loop /* loop */ | ||
382 | addi r4, r4, -1 /* n-- (IN DELAY SLOT) */ | ||
383 | |||
384 | d_dalign_done: | ||
385 | addi r4, r0, 32 /* n = 32 */ | ||
386 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ | ||
387 | /* if n < 0, less than one block to transfer */ | ||
388 | blti r4, d_block_done | ||
389 | |||
390 | d_block_xfer: | ||
391 | andi r4, r7, 0xffffffe0 /* n = c & ~31 */ | ||
392 | rsub r7, r4, r7 /* c = c - n */ | ||
393 | |||
394 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
395 | /* if temp != 0, unaligned transfers needed */ | ||
396 | bnei r9, d_block_unaligned | ||
397 | |||
398 | d_block_aligned: | ||
399 | addi r6, r6, -32 /* s = s - 32 */ | ||
400 | addi r5, r5, -32 /* d = d - 32 */ | ||
401 | lwi r9, r6, 28 /* t1 = *(s + 28) */ | ||
402 | lwi r10, r6, 24 /* t2 = *(s + 24) */ | ||
403 | lwi r11, r6, 20 /* t3 = *(s + 20) */ | ||
404 | lwi r12, r6, 16 /* t4 = *(s + 16) */ | ||
405 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
406 | swi r10, r5, 24 /* *(d + 24) = t2 */ | ||
407 | swi r11, r5, 20 /* *(d + 20) = t3 */ | ||
408 | swi r12, r5, 16 /* *(d + 16) = t4 */ | ||
409 | lwi r9, r6, 12 /* t1 = *(s + 12) */ | ||
410 | lwi r10, r6, 8 /* t2 = *(s + 8) */ | ||
411 | lwi r11, r6, 4 /* t3 = *(s + 4) */ | ||
412 | lwi r12, r6, 0 /* t4 = *(s + 0) */ | ||
413 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
414 | swi r10, r5, 8 /* *(d + 8) = t2 */ | ||
415 | swi r11, r5, 4 /* *(d + 4) = t3 */ | ||
416 | addi r4, r4, -32 /* n = n - 32 */ | ||
417 | bneid r4, d_block_aligned /* while (n) loop */ | ||
418 | swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */ | ||
419 | bri d_block_done | ||
420 | |||
421 | d_block_unaligned: | ||
422 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
423 | rsub r6, r4, r6 /* s = s - n (moved back once; the loops below walk as, not s) */ | ||
424 | lwi r11, r8, 0 /* h = *(as + 0) */ | ||
425 | |||
426 | addi r9, r9, -1 | ||
427 | beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */ | ||
428 | addi r9, r9, -1 | ||
429 | beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */ | ||
430 | |||
431 | d_block_u3: | ||
432 | bsrli r11, r11, 8 /* h = h >> 8 */ | ||
433 | d_bu3_loop: | ||
434 | addi r8, r8, -32 /* as = as - 32 */ | ||
435 | addi r5, r5, -32 /* d = d - 32 */ | ||
436 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
437 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
438 | or r9, r11, r9 /* t1 = h | t1 */ | ||
439 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
440 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
441 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
442 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
443 | or r9, r11, r9 /* t1 = h | t1 */ | ||
444 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
445 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
446 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
447 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
448 | or r9, r11, r9 /* t1 = h | t1 */ | ||
449 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
450 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
451 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
452 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
453 | or r9, r11, r9 /* t1 = h | t1 */ | ||
454 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
455 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
456 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
457 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
458 | or r9, r11, r9 /* t1 = h | t1 */ | ||
459 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
460 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
461 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
462 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
463 | or r9, r11, r9 /* t1 = h | t1 */ | ||
464 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
465 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
466 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
467 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
468 | or r9, r11, r9 /* t1 = h | t1 */ | ||
469 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
470 | bsrli r11, r12, 8 /* h = v >> 8 */ | ||
471 | lwi r12, r8, 0 /* v = *(as + 0) */ | ||
472 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
473 | or r9, r11, r9 /* t1 = h | t1 */ | ||
474 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
475 | addi r4, r4, -32 /* n = n - 32 */ | ||
476 | bneid r4, d_bu3_loop /* while (n) loop */ | ||
477 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ | ||
478 | bri d_block_done | ||
479 | |||
480 | d_block_u1: | ||
481 | bsrli r11, r11, 24 /* h = h >> 24 */ | ||
482 | d_bu1_loop: | ||
483 | addi r8, r8, -32 /* as = as - 32 */ | ||
484 | addi r5, r5, -32 /* d = d - 32 */ | ||
485 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
486 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
487 | or r9, r11, r9 /* t1 = h | t1 */ | ||
488 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
489 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
490 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
491 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
492 | or r9, r11, r9 /* t1 = h | t1 */ | ||
493 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
494 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
495 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
496 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
497 | or r9, r11, r9 /* t1 = h | t1 */ | ||
498 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
499 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
500 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
501 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
502 | or r9, r11, r9 /* t1 = h | t1 */ | ||
503 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
504 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
505 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
506 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
507 | or r9, r11, r9 /* t1 = h | t1 */ | ||
508 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
509 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
510 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
511 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
512 | or r9, r11, r9 /* t1 = h | t1 */ | ||
513 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
514 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
515 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
516 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
517 | or r9, r11, r9 /* t1 = h | t1 */ | ||
518 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
519 | bsrli r11, r12, 24 /* h = v >> 24 */ | ||
520 | lwi r12, r8, 0 /* v = *(as + 0) */ | ||
521 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
522 | or r9, r11, r9 /* t1 = h | t1 */ | ||
523 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
524 | addi r4, r4, -32 /* n = n - 32 */ | ||
525 | bneid r4, d_bu1_loop /* while (n) loop */ | ||
526 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ | ||
527 | bri d_block_done | ||
528 | |||
529 | d_block_u2: | ||
530 | bsrli r11, r11, 16 /* h = h >> 16 */ | ||
531 | d_bu2_loop: | ||
532 | addi r8, r8, -32 /* as = as - 32 */ | ||
533 | addi r5, r5, -32 /* d = d - 32 */ | ||
534 | lwi r12, r8, 28 /* v = *(as + 28) */ | ||
535 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
536 | or r9, r11, r9 /* t1 = h | t1 */ | ||
537 | swi r9, r5, 28 /* *(d + 28) = t1 */ | ||
538 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
539 | lwi r12, r8, 24 /* v = *(as + 24) */ | ||
540 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
541 | or r9, r11, r9 /* t1 = h | t1 */ | ||
542 | swi r9, r5, 24 /* *(d + 24) = t1 */ | ||
543 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
544 | lwi r12, r8, 20 /* v = *(as + 20) */ | ||
545 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
546 | or r9, r11, r9 /* t1 = h | t1 */ | ||
547 | swi r9, r5, 20 /* *(d + 20) = t1 */ | ||
548 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
549 | lwi r12, r8, 16 /* v = *(as + 16) */ | ||
550 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
551 | or r9, r11, r9 /* t1 = h | t1 */ | ||
552 | swi r9, r5, 16 /* *(d + 16) = t1 */ | ||
553 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
554 | lwi r12, r8, 12 /* v = *(as + 12) */ | ||
555 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
556 | or r9, r11, r9 /* t1 = h | t1 */ | ||
557 | swi r9, r5, 12 /* *(d + 12) = t1 */ | ||
558 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
559 | lwi r12, r8, 8 /* v = *(as + 8) */ | ||
560 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
561 | or r9, r11, r9 /* t1 = h | t1 */ | ||
562 | swi r9, r5, 8 /* *(d + 8) = t1 */ | ||
563 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
564 | lwi r12, r8, 4 /* v = *(as + 4) */ | ||
565 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
566 | or r9, r11, r9 /* t1 = h | t1 */ | ||
567 | swi r9, r5, 4 /* *(d + 4) = t1 */ | ||
568 | bsrli r11, r12, 16 /* h = v >> 16 */ | ||
569 | lwi r12, r8, 0 /* v = *(as + 0) */ | ||
570 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
571 | or r9, r11, r9 /* t1 = h | t1 */ | ||
572 | swi r9, r5, 0 /* *(d + 0) = t1 */ | ||
573 | addi r4, r4, -32 /* n = n - 32 */ | ||
574 | bneid r4, d_bu2_loop /* while (n) loop */ | ||
575 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ | ||
576 | |||
577 | d_block_done: | ||
578 | addi r4, r0, 4 /* n = 4 */ | ||
579 | cmpu r4, r4, r7 /* n = c - n (unsigned) */ | ||
580 | blti r4,d_xfer_end /* if n < 0, less than one word to transfer */ | ||
581 | |||
582 | d_word_xfer: | ||
583 | andi r4, r7, 0xfffffffc /* n = c & ~3 */ | ||
584 | rsub r5, r4, r5 /* d = d - n */ | ||
585 | rsub r6, r4, r6 /* s = s - n */ | ||
586 | rsub r7, r4, r7 /* c = c - n */ | ||
587 | |||
588 | andi r9, r6, 3 /* t1 = s & 3 */ | ||
589 | /* if temp != 0, unaligned transfers needed */ | ||
590 | bnei r9, d_word_unaligned | ||
591 | |||
592 | d_word_aligned: | ||
593 | addi r4, r4,-4 /* n = n - 4 */ | ||
594 | lw r9, r6, r4 /* t1 = *(s+n) */ | ||
595 | bneid r4, d_word_aligned /* loop */ | ||
596 | sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */ | ||
597 | |||
598 | bri d_word_done | ||
599 | |||
600 | d_word_unaligned: | ||
601 | andi r8, r6, 0xfffffffc /* as = s & ~3 */ | ||
602 | lw r11, r8, r4 /* h = *(as + n) */ | ||
603 | |||
604 | addi r9, r9, -1 | ||
605 | beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */ | ||
606 | addi r9, r9, -1 | ||
607 | beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */ | ||
608 | |||
609 | d_word_u3: | ||
610 | bsrli r11, r11, 8 /* h = h >> 8 */ | ||
611 | d_wu3_loop: | ||
612 | addi r4, r4,-4 /* n = n - 4 */ | ||
613 | lw r12, r8, r4 /* v = *(as + n) */ | ||
614 | bslli r9, r12, 24 /* t1 = v << 24 */ | ||
615 | or r9, r11, r9 /* t1 = h | t1 */ | ||
616 | sw r9, r5, r4 /* *(d + n) = t1 */ | ||
617 | bneid r4, d_wu3_loop /* while (n) loop */ | ||
618 | bsrli r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */ | ||
619 | |||
620 | bri d_word_done | ||
621 | |||
622 | d_word_u1: | ||
623 | bsrli r11, r11, 24 /* h = h >> 24 */ | ||
624 | d_wu1_loop: | ||
625 | addi r4, r4,-4 /* n = n - 4 */ | ||
626 | lw r12, r8, r4 /* v = *(as + n) */ | ||
627 | bslli r9, r12, 8 /* t1 = v << 8 */ | ||
628 | or r9, r11, r9 /* t1 = h | t1 */ | ||
629 | sw r9, r5, r4 /* *(d + n) = t1 */ | ||
630 | bneid r4, d_wu1_loop /* while (n) loop */ | ||
631 | bsrli r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */ | ||
632 | |||
633 | bri d_word_done | ||
634 | |||
635 | d_word_u2: | ||
636 | bsrli r11, r11, 16 /* h = h >> 16 */ | ||
637 | d_wu2_loop: | ||
638 | addi r4, r4,-4 /* n = n - 4 */ | ||
639 | lw r12, r8, r4 /* v = *(as + n) */ | ||
640 | bslli r9, r12, 16 /* t1 = v << 16 */ | ||
641 | or r9, r11, r9 /* t1 = h | t1 */ | ||
642 | sw r9, r5, r4 /* *(d + n) = t1 */ | ||
643 | bneid r4, d_wu2_loop /* while (n) loop */ | ||
644 | bsrli r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */ | ||
645 | |||
646 | d_word_done: | ||
647 | |||
648 | d_xfer_end: | ||
649 | d_xfer_end_loop: | ||
650 | beqi r7, d_done /* while (c) */ | ||
651 | addi r6, r6, -1 /* s-- */ | ||
652 | lbui r9, r6, 0 /* t1 = *s */ | ||
653 | addi r5, r5, -1 /* d-- */ | ||
654 | sbi r9, r5, 0 /* *d = t1 */ | ||
655 | brid d_xfer_end_loop /* loop */ | ||
656 | addi r7, r7, -1 /* c-- (IN DELAY SLOT) */ | ||
657 | |||
658 | d_done: | ||
659 | rtsd r15, 8 | ||
660 | nop | ||
661 | |||
662 | .end memmove | ||
diff --git a/arch/microblaze/lib/memcpy.c b/arch/microblaze/lib/memcpy.c new file mode 100644 index 00000000000..5880119c448 --- /dev/null +++ b/arch/microblaze/lib/memcpy.c | |||
@@ -0,0 +1,161 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> | ||
3 | * Copyright (C) 2008-2009 PetaLogix | ||
4 | * Copyright (C) 2007 John Williams | ||
5 | * | ||
6 | * Reasonably optimised generic C-code for memcpy on Microblaze | ||
7 | * This is generic C code to do efficient, alignment-aware memcpy. | ||
8 | * | ||
9 | * It is based on demo code originally Copyright 2001 by Intel Corp, taken from | ||
10 | * http://www.embedded.com/showArticle.jhtml?articleID=19205567 | ||
11 | * | ||
12 | * Attempts were made, unsuccessfully, to contact the original | ||
13 | * author of this code (Michael Morrow, Intel). Below is the original | ||
14 | * copyright notice. | ||
15 | * | ||
16 | * This software has been developed by Intel Corporation. | ||
17 | * Intel specifically disclaims all warranties, express or | ||
18 | * implied, and all liability, including consequential and | ||
19 | * other indirect damages, for the use of this program, including | ||
20 | * liability for infringement of any proprietary rights, | ||
21 | * and including the warranties of merchantability and fitness | ||
22 | * for a particular purpose. Intel does not assume any | ||
23 | * responsibility for and errors which may appear in this program | ||
24 | * not any responsibility to update it. | ||
25 | */ | ||
26 | |||
27 | #include <linux/types.h> | ||
28 | #include <linux/stddef.h> | ||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/module.h> | ||
31 | |||
32 | #include <linux/string.h> | ||
33 | #include <asm/system.h> | ||
34 | |||
35 | #ifdef __HAVE_ARCH_MEMCPY | ||
36 | void *memcpy(void *v_dst, const void *v_src, __kernel_size_t c) | ||
37 | { /* Copies c bytes from v_src to v_dst in ascending address order; returns v_dst. */ | ||
38 | const char *src = v_src; | ||
39 | char *dst = v_dst; | ||
40 | #ifndef CONFIG_OPT_LIB_FUNCTION | ||
41 | /* Simple, byte oriented memcpy. */ | ||
42 | while (c--) | ||
43 | *dst++ = *src++; | ||
44 | |||
45 | return v_dst; | ||
46 | #else | ||
47 | /* The following code tries to optimize the copy by using unsigned | ||
48 | * alignment. This will work fine if both source and destination are | ||
49 | * aligned on the same boundary. However, if they are aligned on | ||
50 | * different boundaries shifts will be necessary. This might result in | ||
51 | * bad performance on MicroBlaze systems without a barrel shifter. | ||
52 | */ | ||
53 | const uint32_t *i_src; | ||
54 | uint32_t *i_dst; | ||
55 | |||
56 | if (c >= 4) { | ||
57 | unsigned value, buf_hold; | ||
58 | |||
59 | /* Align the destination to a word boundary. */ | ||
60 | /* This is done in an endian independent manner. */ | ||
61 | switch ((unsigned long)dst & 3) { | ||
62 | case 1: | ||
63 | *dst++ = *src++; | ||
64 | --c; /* fall through */ | ||
65 | case 2: | ||
66 | *dst++ = *src++; | ||
67 | --c; /* fall through */ | ||
68 | case 3: | ||
69 | *dst++ = *src++; | ||
70 | --c; | ||
71 | } | ||
72 | |||
73 | i_dst = (void *)dst; | ||
74 | |||
75 | /* Choose a copy scheme based on the source */ | ||
76 | /* alignment relative to destination. */ | ||
77 | switch ((unsigned long)src & 3) { | ||
78 | case 0x0: /* Both byte offsets are aligned */ | ||
79 | i_src = (const void *)src; | ||
80 | |||
81 | for (; c >= 4; c -= 4) | ||
82 | *i_dst++ = *i_src++; | ||
83 | |||
84 | src = (const void *)i_src; | ||
85 | break; | ||
86 | case 0x1: /* Unaligned - Off by 1 */ | ||
87 | /* Word align the source */ | ||
88 | i_src = (const void *) ((unsigned long)src & ~3); | ||
89 | |||
90 | /* Load the holding buffer (shift directions assume big-endian words) */ | ||
91 | buf_hold = *i_src++ << 8; | ||
92 | |||
93 | for (; c >= 4; c -= 4) { | ||
94 | value = *i_src++; | ||
95 | *i_dst++ = buf_hold | value >> 24; | ||
96 | buf_hold = value << 8; | ||
97 | } | ||
98 | |||
99 | /* Realign the source */ | ||
100 | src = (const void *)i_src; | ||
101 | src -= 3; | ||
102 | break; | ||
103 | case 0x2: /* Unaligned - Off by 2 */ | ||
104 | /* Word align the source */ | ||
105 | i_src = (const void *) ((unsigned long)src & ~3); | ||
106 | |||
107 | /* Load the holding buffer */ | ||
108 | buf_hold = *i_src++ << 16; | ||
109 | |||
110 | for (; c >= 4; c -= 4) { | ||
111 | value = *i_src++; | ||
112 | *i_dst++ = buf_hold | value >> 16; | ||
113 | buf_hold = value << 16; | ||
114 | } | ||
115 | |||
116 | /* Realign the source */ | ||
117 | src = (const void *)i_src; | ||
118 | src -= 2; | ||
119 | break; | ||
120 | case 0x3: /* Unaligned - Off by 3 */ | ||
121 | /* Word align the source */ | ||
122 | i_src = (const void *) ((unsigned long)src & ~3); | ||
123 | |||
124 | /* Load the holding buffer */ | ||
125 | buf_hold = *i_src++ << 24; | ||
126 | |||
127 | for (; c >= 4; c -= 4) { | ||
128 | value = *i_src++; | ||
129 | *i_dst++ = buf_hold | value >> 8; | ||
130 | buf_hold = value << 24; | ||
131 | } | ||
132 | |||
133 | /* Realign the source */ | ||
134 | src = (const void *)i_src; | ||
135 | src -= 1; | ||
136 | break; | ||
137 | } | ||
138 | dst = (void *)i_dst; | ||
139 | } | ||
140 | |||
141 | /* Finish off any remaining bytes */ | ||
142 | /* simple fast copy, ... unless a cache boundary is crossed */ | ||
143 | switch (c) { | ||
144 | case 3: | ||
145 | *dst++ = *src++; /* fall through */ | ||
146 | case 2: | ||
147 | *dst++ = *src++; /* fall through */ | ||
148 | case 1: | ||
149 | *dst++ = *src++; | ||
150 | } | ||
151 | |||
152 | return v_dst; | ||
153 | #endif | ||
154 | } | ||
155 | EXPORT_SYMBOL(memcpy); | ||
156 | #endif /* __HAVE_ARCH_MEMCPY */ | ||
157 | |||
158 | void *cacheable_memcpy(void *d, const void *s, __kernel_size_t c) | ||
159 | { /* Thin wrapper: forwards its arguments to memcpy() and returns its result. */ | ||
160 | return memcpy(d, s, c); | ||
161 | } | ||
diff --git a/arch/microblaze/lib/memmove.c b/arch/microblaze/lib/memmove.c new file mode 100644 index 00000000000..d4e9f49a71f --- /dev/null +++ b/arch/microblaze/lib/memmove.c | |||
@@ -0,0 +1,175 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> | ||
3 | * Copyright (C) 2008-2009 PetaLogix | ||
4 | * Copyright (C) 2007 John Williams | ||
5 | * | ||
6 | * Reasonably optimised generic C-code for memmove on Microblaze | ||
7 | * This is generic C code to do efficient, alignment-aware memmove. | ||
8 | * | ||
9 | * It is based on demo code originally Copyright 2001 by Intel Corp, taken from | ||
10 | * http://www.embedded.com/showArticle.jhtml?articleID=19205567 | ||
11 | * | ||
12 | * Attempts were made, unsuccessfully, to contact the original | ||
13 | * author of this code (Michael Morrow, Intel). Below is the original | ||
14 | * copyright notice. | ||
15 | * | ||
16 | * This software has been developed by Intel Corporation. | ||
17 | * Intel specifically disclaims all warranties, express or | ||
18 | * implied, and all liability, including consequential and | ||
19 | * other indirect damages, for the use of this program, including | ||
20 | * liability for infringement of any proprietary rights, | ||
21 | * and including the warranties of merchantability and fitness | ||
22 | * for a particular purpose. Intel does not assume any | ||
23 | * responsibility for and errors which may appear in this program | ||
24 | * not any responsibility to update it. | ||
25 | */ | ||
26 | |||
27 | #include <linux/types.h> | ||
28 | #include <linux/stddef.h> | ||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/string.h> | ||
32 | |||
33 | #ifdef __HAVE_ARCH_MEMMOVE | ||
34 | void *memmove(void *v_dst, const void *v_src, __kernel_size_t c) | ||
35 | { | ||
36 | const char *src = v_src; | ||
37 | char *dst = v_dst; | ||
38 | |||
39 | #ifdef CONFIG_OPT_LIB_FUNCTION | ||
40 | const uint32_t *i_src; | ||
41 | uint32_t *i_dst; | ||
42 | #endif | ||
43 | |||
44 | if (!c) | ||
45 | return v_dst; | ||
46 | |||
47 | /* Use memcpy when source is higher than dest */ | ||
48 | if (v_dst <= v_src) | ||
49 | return memcpy(v_dst, v_src, c); | ||
50 | |||
51 | #ifndef CONFIG_OPT_LIB_FUNCTION | ||
52 | /* copy backwards, from end to beginning */ | ||
53 | src += c; | ||
54 | dst += c; | ||
55 | |||
56 | /* Simple, byte oriented memmove. */ | ||
57 | while (c--) | ||
58 | *--dst = *--src; | ||
59 | |||
60 | return v_dst; | ||
61 | #else | ||
62 | /* The following code tries to optimize the copy by using unsigned | ||
63 | * alignment. This will work fine if both source and destination are | ||
64 | * aligned on the same boundary. However, if they are aligned on | ||
65 | * different boundaries shifts will be necessary. This might result in | ||
66 | * bad performance on MicroBlaze systems without a barrel shifter. | ||
67 | */ | ||
68 | /* FIXME this part needs more test */ | ||
69 | /* Do a descending copy - this is a bit trickier! */ | ||
70 | dst += c; | ||
71 | src += c; | ||
72 | |||
73 | if (c >= 4) { | ||
74 | unsigned value, buf_hold; | ||
75 | |||
76 | /* Align the destination to a word boundry. */ | ||
77 | /* This is done in an endian independant manner. */ | ||
78 | |||
79 | switch ((unsigned long)dst & 3) { | ||
80 | case 3: | ||
81 | *--dst = *--src; | ||
82 | --c; | ||
83 | case 2: | ||
84 | *--dst = *--src; | ||
85 | --c; | ||
86 | case 1: | ||
87 | *--dst = *--src; | ||
88 | --c; | ||
89 | } | ||
90 | |||
91 | i_dst = (void *)dst; | ||
92 | /* Choose a copy scheme based on the source */ | ||
93 | /* alignment relative to dstination. */ | ||
94 | switch ((unsigned long)src & 3) { | ||
95 | case 0x0: /* Both byte offsets are aligned */ | ||
96 | |||
97 | i_src = (const void *)src; | ||
98 | |||
99 | for (; c >= 4; c -= 4) | ||
100 | *--i_dst = *--i_src; | ||
101 | |||
102 | src = (const void *)i_src; | ||
103 | break; | ||
104 | case 0x1: /* Unaligned - Off by 1 */ | ||
105 | /* Word align the source */ | ||
106 | i_src = (const void *) (((unsigned)src + 4) & ~3); | ||
107 | |||
108 | /* Load the holding buffer */ | ||
109 | buf_hold = *--i_src >> 24; | ||
110 | |||
111 | for (; c >= 4; c -= 4) { | ||
112 | value = *--i_src; | ||
113 | *--i_dst = buf_hold << 8 | value; | ||
114 | buf_hold = value >> 24; | ||
115 | } | ||
116 | |||
117 | /* Realign the source */ | ||
118 | src = (const void *)i_src; | ||
119 | src += 1; | ||
120 | break; | ||
121 | case 0x2: /* Unaligned - Off by 2 */ | ||
122 | /* Word align the source */ | ||
123 | i_src = (const void *) (((unsigned)src + 4) & ~3); | ||
124 | |||
125 | /* Load the holding buffer */ | ||
126 | buf_hold = *--i_src >> 16; | ||
127 | |||
128 | for (; c >= 4; c -= 4) { | ||
129 | value = *--i_src; | ||
130 | *--i_dst = buf_hold << 16 | value; | ||
131 | buf_hold = value >> 16; | ||
132 | } | ||
133 | |||
134 | /* Realign the source */ | ||
135 | src = (const void *)i_src; | ||
136 | src += 2; | ||
137 | break; | ||
138 | case 0x3: /* Unaligned - Off by 3 */ | ||
139 | /* Word align the source */ | ||
140 | i_src = (const void *) (((unsigned)src + 4) & ~3); | ||
141 | |||
142 | /* Load the holding buffer */ | ||
143 | buf_hold = *--i_src >> 8; | ||
144 | |||
145 | for (; c >= 4; c -= 4) { | ||
146 | value = *--i_src; | ||
147 | *--i_dst = buf_hold << 24 | value; | ||
148 | buf_hold = value >> 8; | ||
149 | } | ||
150 | |||
151 | /* Realign the source */ | ||
152 | src = (const void *)i_src; | ||
153 | src += 3; | ||
154 | break; | ||
155 | } | ||
156 | dst = (void *)i_dst; | ||
157 | } | ||
158 | |||
159 | /* simple fast copy, ... unless a cache boundry is crossed */ | ||
160 | /* Finish off any remaining bytes */ | ||
161 | switch (c) { | ||
162 | case 4: | ||
163 | *--dst = *--src; | ||
164 | case 3: | ||
165 | *--dst = *--src; | ||
166 | case 2: | ||
167 | *--dst = *--src; | ||
168 | case 1: | ||
169 | *--dst = *--src; | ||
170 | } | ||
171 | return v_dst; | ||
172 | #endif | ||
173 | } | ||
174 | EXPORT_SYMBOL(memmove); | ||
175 | #endif /* __HAVE_ARCH_MEMMOVE */ | ||
diff --git a/arch/microblaze/lib/memset.c b/arch/microblaze/lib/memset.c new file mode 100644 index 00000000000..941dc8f94b0 --- /dev/null +++ b/arch/microblaze/lib/memset.c | |||
@@ -0,0 +1,82 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu> | ||
3 | * Copyright (C) 2008-2009 PetaLogix | ||
4 | * Copyright (C) 2007 John Williams | ||
5 | * | ||
6 | * Reasonably optimised generic C-code for memset on Microblaze | ||
7 | * This is generic C code to do efficient, alignment-aware memset. | ||
8 | * | ||
9 | * It is based on demo code originally Copyright 2001 by Intel Corp, taken from | ||
10 | * http://www.embedded.com/showArticle.jhtml?articleID=19205567 | ||
11 | * | ||
12 | * Attempts were made, unsuccessfully, to contact the original | ||
13 | * author of this code (Michael Morrow, Intel). Below is the original | ||
14 | * copyright notice. | ||
15 | * | ||
16 | * This software has been developed by Intel Corporation. | ||
17 | * Intel specifically disclaims all warranties, express or | ||
18 | * implied, and all liability, including consequential and | ||
19 | * other indirect damages, for the use of this program, including | ||
20 | * liability for infringement of any proprietary rights, | ||
21 | * and including the warranties of merchantability and fitness | ||
22 | * for a particular purpose. Intel does not assume any | ||
23 | * responsibility for and errors which may appear in this program | ||
24 | * not any responsibility to update it. | ||
25 | */ | ||
26 | |||
27 | #include <linux/types.h> | ||
28 | #include <linux/stddef.h> | ||
29 | #include <linux/compiler.h> | ||
30 | #include <linux/module.h> | ||
31 | #include <linux/string.h> | ||
32 | |||
33 | #ifdef __HAVE_ARCH_MEMSET | ||
34 | void *memset(void *v_src, int c, __kernel_size_t n) | ||
35 | { | ||
36 | |||
37 | char *src = v_src; | ||
38 | #ifdef CONFIG_OPT_LIB_FUNCTION | ||
39 | uint32_t *i_src; | ||
40 | uint32_t w32; | ||
41 | #endif | ||
42 | /* Truncate c to 8 bits */ | ||
43 | c = (c & 0xFF); | ||
44 | |||
45 | #ifdef CONFIG_OPT_LIB_FUNCTION | ||
46 | /* Make a repeating word out of it */ | ||
47 | w32 = c; | ||
48 | w32 |= w32 << 8; | ||
49 | w32 |= w32 << 16; | ||
50 | |||
51 | if (n >= 4) { | ||
52 | /* Align the destination to a word boundary */ | ||
53 | /* This is done in an endian independant manner */ | ||
54 | switch ((unsigned) src & 3) { | ||
55 | case 1: | ||
56 | *src++ = c; | ||
57 | --n; | ||
58 | case 2: | ||
59 | *src++ = c; | ||
60 | --n; | ||
61 | case 3: | ||
62 | *src++ = c; | ||
63 | --n; | ||
64 | } | ||
65 | |||
66 | i_src = (void *)src; | ||
67 | |||
68 | /* Do as many full-word copies as we can */ | ||
69 | for (; n >= 4; n -= 4) | ||
70 | *i_src++ = w32; | ||
71 | |||
72 | src = (void *)i_src; | ||
73 | } | ||
74 | #endif | ||
75 | /* Simple, byte oriented memset or the rest of count. */ | ||
76 | while (n--) | ||
77 | *src++ = c; | ||
78 | |||
79 | return v_src; | ||
80 | } | ||
81 | EXPORT_SYMBOL(memset); | ||
82 | #endif /* __HAVE_ARCH_MEMSET */ | ||