Diffstat (limited to 'arch/powerpc/lib/copy32.S')
-rw-r--r--	arch/powerpc/lib/copy32.S	543
1 file changed, 543 insertions(+), 0 deletions(-)
diff --git a/arch/powerpc/lib/copy32.S b/arch/powerpc/lib/copy32.S
new file mode 100644
index 000000000000..420a912198a2
--- /dev/null
+++ b/arch/powerpc/lib/copy32.S
@@ -0,0 +1,543 @@
/*
 * Memory copy functions for 32-bit PowerPC.
 *
 * Copyright (C) 1996-2005 Paul Mackerras.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/processor.h>
#include <asm/cache.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

#define COPY_16_BYTES		\
	lwz	r7,4(r4);	\
	lwz	r8,8(r4);	\
	lwz	r9,12(r4);	\
	lwzu	r10,16(r4);	\
	stw	r7,4(r6);	\
	stw	r8,8(r6);	\
	stw	r9,12(r6);	\
	stwu	r10,16(r6)

#define COPY_16_BYTES_WITHEX(n)	\
8 ## n ## 0:			\
	lwz	r7,4(r4);	\
8 ## n ## 1:			\
	lwz	r8,8(r4);	\
8 ## n ## 2:			\
	lwz	r9,12(r4);	\
8 ## n ## 3:			\
	lwzu	r10,16(r4);	\
8 ## n ## 4:			\
	stw	r7,4(r6);	\
8 ## n ## 5:			\
	stw	r8,8(r6);	\
8 ## n ## 6:			\
	stw	r9,12(r6);	\
8 ## n ## 7:			\
	stwu	r10,16(r6)

#define COPY_16_BYTES_EXCODE(n)			\
9 ## n ## 0:					\
	addi	r5,r5,-(16 * n);		\
	b	104f;				\
9 ## n ## 1:					\
	addi	r5,r5,-(16 * n);		\
	b	105f;				\
	.section __ex_table,"a";		\
	.align	2;				\
	.long	8 ## n ## 0b,9 ## n ## 0b;	\
	.long	8 ## n ## 1b,9 ## n ## 0b;	\
	.long	8 ## n ## 2b,9 ## n ## 0b;	\
	.long	8 ## n ## 3b,9 ## n ## 0b;	\
	.long	8 ## n ## 4b,9 ## n ## 1b;	\
	.long	8 ## n ## 5b,9 ## n ## 1b;	\
	.long	8 ## n ## 6b,9 ## n ## 1b;	\
	.long	8 ## n ## 7b,9 ## n ## 1b;	\
	.text
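For readers unfamiliar with the mechanism: each `.long addr,fixup` pair in the `__ex_table` section tells the page-fault handler that a fault at instruction `addr` should resume at `fixup` instead of oopsing. A conceptual sketch in C of that lookup follows; the struct and function names are illustrative, not the kernel's actual ppc32 definitions (the real table is sorted and searched more efficiently).

	/* Conceptual model of the exception-table lookup (illustrative only). */
	struct ex_entry {
		unsigned long insn;	/* address of the faulting instruction */
		unsigned long fixup;	/* address to resume execution at      */
	};

	/* On a fault inside a user copy, the trap handler scans the table
	 * and, if the faulting PC matches an entry, resumes at its fixup. */
	static unsigned long search_ex_table(const struct ex_entry *tbl, int n,
					     unsigned long pc)
	{
		for (int i = 0; i < n; i++)
			if (tbl[i].insn == pc)
				return tbl[i].fixup;
		return 0;	/* no entry: the fault is fatal */
	}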

	.text
	.stabs	"arch/powerpc/lib/",N_SO,0,0,0f
	.stabs	"copy32.S",N_SO,0,0,0f
0:

CACHELINE_BYTES = L1_CACHE_LINE_SIZE
LG_CACHELINE_BYTES = LG_L1_CACHE_LINE_SIZE
CACHELINE_MASK = (L1_CACHE_LINE_SIZE-1)

/*
 * Use dcbz on the complete cache lines in the destination
 * to set them to zero.  This requires that the destination
 * area is cacheable.  -- paulus
 */
_GLOBAL(cacheable_memzero)
	mr	r5,r4
	li	r4,0
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	clrlwi	r7,r6,32-LG_CACHELINE_BYTES
	add	r8,r7,r5
	srwi	r9,r8,LG_CACHELINE_BYTES
	addic.	r9,r9,-1	/* total number of complete cachelines */
	ble	2f
	xori	r0,r7,CACHELINE_MASK & ~3
	srwi.	r0,r0,2
	beq	3f
	mtctr	r0
4:	stwu	r4,4(r6)
	bdnz	4b
3:	mtctr	r9
	li	r7,4
#if !defined(CONFIG_8xx)
10:	dcbz	r7,r6
#else
10:	stw	r4, 4(r6)
	stw	r4, 8(r6)
	stw	r4, 12(r6)
	stw	r4, 16(r6)
#if L1_CACHE_LINE_SIZE >= 32
	stw	r4, 20(r6)
	stw	r4, 24(r6)
	stw	r4, 28(r6)
	stw	r4, 32(r6)
#endif /* L1_CACHE_LINE_SIZE >= 32 */
#endif
	addi	r6,r6,CACHELINE_BYTES
	bdnz	10b
	clrlwi	r5,r8,32-LG_CACHELINE_BYTES
	addi	r5,r5,4
2:	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
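In C terms, cacheable_memzero stores word-aligned zeros up to the first cache-line boundary, zeroes whole lines with dcbz (one instruction per line rather than a chain of stores), then finishes the partial line with word and byte stores. A minimal sketch of the same flow, assuming a hypothetical dcbz_line() wrapper for the cache-block-zero instruction and an illustrative line size:

	#include <stddef.h>
	#include <stdint.h>

	#define CACHELINE_BYTES 32	/* assumed line size, for illustration */

	/* Hypothetical wrapper around the dcbz instruction. */
	extern void dcbz_line(void *p);

	void cacheable_memzero_sketch(void *dst, size_t n)
	{
		uint8_t *p = dst;

		/* Store zeros until p reaches a cache-line boundary. */
		while (n && ((uintptr_t)p & (CACHELINE_BYTES - 1))) {
			*p++ = 0;
			n--;
		}
		/* Zero whole lines with dcbz: no per-word store traffic. */
		while (n >= CACHELINE_BYTES) {
			dcbz_line(p);
			p += CACHELINE_BYTES;
			n -= CACHELINE_BYTES;
		}
		/* Byte tail. */
		while (n--)
			*p++ = 0;
	}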

_GLOBAL(memset)
	rlwimi	r4,r4,8,16,23
	rlwimi	r4,r4,16,0,15
	addi	r6,r3,-4
	cmplwi	0,r5,4
	blt	7f
	stwu	r4,4(r6)
	beqlr
	andi.	r0,r6,3
	add	r5,r0,r5
	subf	r6,r0,r6
	srwi	r0,r5,2
	mtctr	r0
	bdz	6f
1:	stwu	r4,4(r6)
	bdnz	1b
6:	andi.	r5,r5,3
7:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r6,r6,3
8:	stbu	r4,1(r6)
	bdnz	8b
	blr
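The two rlwimi instructions at the top of memset replicate the fill byte into all four bytes of a word so the main loop can store a word at a time. The same trick in C (a sketch, not the kernel's implementation):

	#include <stdint.h>

	/* Replicate the low byte across a 32-bit word, as the two rlwimi
	 * instructions above do: 0x000000ab -> 0xabababab. */
	static inline uint32_t replicate_byte(uint8_t c)
	{
		uint32_t v = c;
		v |= v << 8;	/* rlwimi r4,r4,8,16,23: 0x0000abab */
		v |= v << 16;	/* rlwimi r4,r4,16,0,15: 0xabababab */
		return v;
	}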

/*
 * This version uses dcbz on the complete cache lines in the
 * destination area to reduce memory traffic.  This requires that
 * the destination area is cacheable.
 * We only use this version if the source and dest don't overlap.
 * -- paulus.
 */
_GLOBAL(cacheable_memcpy)
	add	r7,r3,r5		/* test if the src & dst overlap */
	add	r8,r4,r5
	cmplw	0,r4,r7
	cmplw	1,r3,r8
	crand	0,0,4			/* cr0.lt &= cr1.lt */
	blt	memcpy			/* if regions overlap */

	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	subf	r5,r0,r5
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
	stwu	r9,4(r6)
	bdnz	72b

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	mtctr	r0
	beq	63f
53:
#if !defined(CONFIG_8xx)
	dcbz	r11,r6
#endif
	COPY_16_BYTES
#if L1_CACHE_LINE_SIZE >= 32
	COPY_16_BYTES
#if L1_CACHE_LINE_SIZE >= 64
	COPY_16_BYTES
	COPY_16_BYTES
#if L1_CACHE_LINE_SIZE >= 128
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
	COPY_16_BYTES
#endif
#endif
#endif
	bdnz	53b

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	blr
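The overlap test at the top computes src < dst+n and dst < src+n, and falls back to plain memcpy when both hold, since dcbz on the destination would destroy not-yet-read source bytes. The same condition in C (illustrative):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	/* True when [dst, dst+n) and [src, src+n) overlap -- the case in
	 * which cacheable_memcpy branches to memcpy instead of using dcbz. */
	static bool regions_overlap(const void *dst, const void *src, size_t n)
	{
		uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;

		return s < d + n && d < s + n;
	}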

_GLOBAL(memmove)
	cmplw	0,r3,r4
	bgt	backwards_memcpy
	/* fall through */

_GLOBAL(memcpy)
	srwi.	r7,r5,3
	addi	r6,r3,-4
	addi	r4,r4,-4
	beq	2f			/* if less than 8 bytes to do */
	andi.	r0,r6,3			/* get dest word aligned */
	mtctr	r7
	bne	5f
1:	lwz	r7,4(r4)
	lwzu	r8,8(r4)
	stw	r7,4(r6)
	stwu	r8,8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,4(r4)
	addi	r5,r5,-4
	stwu	r0,4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
	addi	r4,r4,3
	addi	r6,r6,3
4:	lbzu	r0,1(r4)
	stbu	r0,1(r6)
	bdnz	4b
	blr
5:	subfic	r0,r0,4
	mtctr	r0
6:	lbz	r7,4(r4)
	addi	r4,r4,1
	stb	r7,4(r6)
	addi	r6,r6,1
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
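memcpy moves eight bytes per iteration of its main loop (two load/store word pairs), after byte-copying up to three bytes to word-align the destination, then handles a final word and the byte tail. A C sketch of that staging, assuming 32-bit words (the unaligned word accesses mirror the assembly, which relies on PPC32 tolerating unaligned word loads; strict C would copy bytewise):

	#include <stddef.h>
	#include <stdint.h>

	void *memcpy_sketch(void *dst, const void *src, size_t n)
	{
		uint8_t *d = dst;
		const uint8_t *s = src;

		/* Byte-copy until the destination is word aligned. */
		while (n && ((uintptr_t)d & 3)) {
			*d++ = *s++;
			n--;
		}
		/* Main loop: two words (8 bytes) per iteration, as in the
		 * lwz/lwzu/stw/stwu sequence above. */
		while (n >= 8) {
			((uint32_t *)d)[0] = ((const uint32_t *)s)[0];
			((uint32_t *)d)[1] = ((const uint32_t *)s)[1];
			d += 8; s += 8; n -= 8;
		}
		/* One more word, then the byte tail. */
		if (n >= 4) {
			*(uint32_t *)d = *(const uint32_t *)s;
			d += 4; s += 4; n -= 4;
		}
		while (n--)
			*d++ = *s++;
		return dst;
	}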

_GLOBAL(backwards_memcpy)
	rlwinm.	r7,r5,32-3,3,31		/* r7 = r5 >> 3 */
	add	r6,r3,r5
	add	r4,r4,r5
	beq	2f
	andi.	r0,r6,3
	mtctr	r7
	bne	5f
1:	lwz	r7,-4(r4)
	lwzu	r8,-8(r4)
	stw	r7,-4(r6)
	stwu	r8,-8(r6)
	bdnz	1b
	andi.	r5,r5,7
2:	cmplwi	0,r5,4
	blt	3f
	lwzu	r0,-4(r4)
	subi	r5,r5,4
	stwu	r0,-4(r6)
3:	cmpwi	0,r5,0
	beqlr
	mtctr	r5
4:	lbzu	r0,-1(r4)
	stbu	r0,-1(r6)
	bdnz	4b
	blr
5:	mtctr	r0
6:	lbzu	r7,-1(r4)
	stbu	r7,-1(r6)
	bdnz	6b
	subf	r5,r0,r5
	rlwinm.	r7,r5,32-3,3,31
	beq	2b
	mtctr	r7
	b	1b
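memmove picks a copy direction from the pointer order: when dst > src the regions may overlap such that a forward copy would clobber unread source bytes, so it branches to backwards_memcpy and copies from the end. A direction-choice sketch in C:

	#include <stddef.h>

	void *memmove_sketch(void *dst, const void *src, size_t n)
	{
		char *d = dst;
		const char *s = src;

		if (d <= s) {
			/* Forward copy is safe (the memcpy case). */
			while (n--)
				*d++ = *s++;
		} else {
			/* dst > src: copy backwards so an overlapping
			 * destination never overwrites unread source bytes. */
			d += n; s += n;
			while (n--)
				*--d = *--s;
		}
		return dst;
	}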

_GLOBAL(__copy_tofrom_user)
	addi	r4,r4,-4
	addi	r6,r3,-4
	neg	r0,r3
	andi.	r0,r0,CACHELINE_MASK	/* # bytes to start of cache line */
	beq	58f

	cmplw	0,r5,r0			/* is this more than total to do? */
	blt	63f			/* if not much to do */
	andi.	r8,r0,3			/* get it word-aligned first */
	mtctr	r8
	beq+	61f
70:	lbz	r9,4(r4)		/* do some bytes */
71:	stb	r9,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	70b
61:	subf	r5,r0,r5
	srwi.	r0,r0,2
	mtctr	r0
	beq	58f
72:	lwzu	r9,4(r4)		/* do some words */
73:	stwu	r9,4(r6)
	bdnz	72b

	.section __ex_table,"a"
	.align	2
	.long	70b,100f
	.long	71b,101f
	.long	72b,102f
	.long	73b,103f
	.text

58:	srwi.	r0,r5,LG_CACHELINE_BYTES /* # complete cachelines */
	clrlwi	r5,r5,32-LG_CACHELINE_BYTES
	li	r11,4
	beq	63f

#ifdef CONFIG_8xx
	/* Don't use prefetch on 8xx */
	mtctr	r0
	li	r0,0
53:	COPY_16_BYTES_WITHEX(0)
	bdnz	53b

#else /* not CONFIG_8xx */
	/* Here we decide how far ahead to prefetch the source */
	li	r3,4
	cmpwi	r0,1
	li	r7,0
	ble	114f
	li	r7,1
#if MAX_COPY_PREFETCH > 1
	/* Heuristically, for large transfers we prefetch
	   MAX_COPY_PREFETCH cachelines ahead.  For small transfers
	   we prefetch 1 cacheline ahead. */
	cmpwi	r0,MAX_COPY_PREFETCH
	ble	112f
	li	r7,MAX_COPY_PREFETCH
112:	mtctr	r7
111:	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
	bdnz	111b
#else
	dcbt	r3,r4
	addi	r3,r3,CACHELINE_BYTES
#endif /* MAX_COPY_PREFETCH > 1 */

114:	subf	r8,r7,r0
	mr	r0,r7
	mtctr	r8

53:	dcbt	r3,r4
54:	dcbz	r11,r6
	.section __ex_table,"a"
	.align	2
	.long	54b,105f
	.text
/* the main body of the cacheline loop */
	COPY_16_BYTES_WITHEX(0)
#if L1_CACHE_LINE_SIZE >= 32
	COPY_16_BYTES_WITHEX(1)
#if L1_CACHE_LINE_SIZE >= 64
	COPY_16_BYTES_WITHEX(2)
	COPY_16_BYTES_WITHEX(3)
#if L1_CACHE_LINE_SIZE >= 128
	COPY_16_BYTES_WITHEX(4)
	COPY_16_BYTES_WITHEX(5)
	COPY_16_BYTES_WITHEX(6)
	COPY_16_BYTES_WITHEX(7)
#endif
#endif
#endif
	bdnz	53b
	cmpwi	r0,0
	li	r3,4
	li	r7,0
	bne	114b
#endif /* CONFIG_8xx */

63:	srwi.	r0,r5,2
	mtctr	r0
	beq	64f
30:	lwzu	r0,4(r4)
31:	stwu	r0,4(r6)
	bdnz	30b

64:	andi.	r0,r5,3
	mtctr	r0
	beq+	65f
40:	lbz	r0,4(r4)
41:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	40b
65:	li	r3,0
	blr
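__copy_tofrom_user returns 0 in r3 on success and the number of bytes not copied after a fault. The prefetch-distance heuristic above stays 0 lines ahead for a single-line copy, MAX_COPY_PREFETCH lines ahead for large copies, and 1 line ahead otherwise. A sketch of that choice in C (the MAX_COPY_PREFETCH value here is illustrative; the real one is set per CPU family):

	#define MAX_COPY_PREFETCH 4	/* illustrative value */

	/* lines = number of whole cache lines in the transfer. */
	static int prefetch_distance(unsigned long lines)
	{
		if (lines <= 1)
			return 0;
		if (lines > MAX_COPY_PREFETCH)
			return MAX_COPY_PREFETCH;
		return 1;
	}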

/* read fault, initial single-byte copy */
100:	li	r9,0
	b	90f
/* write fault, initial single-byte copy */
101:	li	r9,1
90:	subf	r5,r8,r5
	li	r3,0
	b	99f
/* read fault, initial word copy */
102:	li	r9,0
	b	91f
/* write fault, initial word copy */
103:	li	r9,1
91:	li	r3,2
	b	99f

/*
 * this stuff handles faults in the cacheline loop and branches to either
 * 104f (if in read part) or 105f (if in write part), after updating r5
 */
	COPY_16_BYTES_EXCODE(0)
#if L1_CACHE_LINE_SIZE >= 32
	COPY_16_BYTES_EXCODE(1)
#if L1_CACHE_LINE_SIZE >= 64
	COPY_16_BYTES_EXCODE(2)
	COPY_16_BYTES_EXCODE(3)
#if L1_CACHE_LINE_SIZE >= 128
	COPY_16_BYTES_EXCODE(4)
	COPY_16_BYTES_EXCODE(5)
	COPY_16_BYTES_EXCODE(6)
	COPY_16_BYTES_EXCODE(7)
#endif
#endif
#endif

/* read fault in cacheline loop */
104:	li	r9,0
	b	92f
/* fault on dcbz (effectively a write fault) */
/* or write fault in cacheline loop */
105:	li	r9,1
92:	li	r3,LG_CACHELINE_BYTES
	mfctr	r8
	add	r0,r0,r8
	b	106f
/* read fault in final word loop */
108:	li	r9,0
	b	93f
/* write fault in final word loop */
109:	li	r9,1
93:	andi.	r5,r5,3
	li	r3,2
	b	99f
/* read fault in final byte loop */
110:	li	r9,0
	b	94f
/* write fault in final byte loop */
111:	li	r9,1
94:	li	r5,0
	li	r3,0
/*
 * At this stage the number of bytes not copied is
 * r5 + (ctr << r3), and r9 is 0 for read or 1 for write.
 */
99:	mfctr	r0
106:	slw	r3,r0,r3
	add.	r3,r3,r5
	beq	120f		/* shouldn't happen */
	cmpwi	0,r9,0
	bne	120f
/* for a read fault, first try to continue the copy one byte at a time */
	mtctr	r3
130:	lbz	r0,4(r4)
131:	stb	r0,4(r6)
	addi	r4,r4,1
	addi	r6,r6,1
	bdnz	130b
/* then clear out the destination: r3 bytes starting at 4(r6) */
132:	mfctr	r3
	srwi.	r0,r3,2
	li	r9,0
	mtctr	r0
	beq	113f
112:	stwu	r9,4(r6)
	bdnz	112b
113:	andi.	r0,r3,3
	mtctr	r0
	beq	120f
114:	stb	r9,4(r6)
	addi	r6,r6,1
	bdnz	114b
120:	blr

	.section __ex_table,"a"
	.align	2
	.long	30b,108b
	.long	31b,109b
	.long	40b,110b
	.long	41b,111b
	.long	130b,132b
	.long	131b,120b
	.long	112b,120b
	.long	114b,120b
	.text
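As the comment before label 99 states, the amount not copied after a fault is r5 + (ctr << r3): r5 holds the leftover byte count, ctr the remaining loop iterations, and r3 the log2 of bytes moved per iteration (0 for the byte loops, 2 for the word loops, LG_CACHELINE_BYTES for the cache-line loop). The same computation in C (illustrative names):

	/* Bytes not yet copied when a user-copy loop faults, mirroring the
	 * r5 + (ctr << r3) computation in the fixup code above. */
	static unsigned long bytes_not_copied(unsigned long leftover,	/* r5  */
					      unsigned long loops_left,	/* ctr */
					      unsigned int log2_step)	/* r3  */
	{
		return leftover + (loops_left << log2_step);
	}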