aboutsummaryrefslogtreecommitdiffstats
path: root/arch/powerpc/lib/memcpy_power7.S
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/lib/memcpy_power7.S')
-rw-r--r--arch/powerpc/lib/memcpy_power7.S647
1 files changed, 647 insertions, 0 deletions
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
new file mode 100644
index 000000000000..0efdc51bc716
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -0,0 +1,647 @@
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
20#include <asm/ppc_asm.h>
21
/*
 * memcpy_power7 - copy r5 bytes from the buffer at r4 to the buffer at r3.
 *
 * In:  r3 = destination, r4 = source, r5 = length in bytes
 * Out: r3 = the destination pointer passed in (every exit path reloads it
 *      from the 48(r1) slot written below)
 *
 * Dispatch on length (unsigned compares):
 *   r5 < 16                          -> .Lshort_copy
 *   r5 > 4096 (CONFIG_ALTIVEC only)  -> .Lvmx_copy
 *   anything else                    -> falls through to .Lnonvmx_copy
 *
 * Both compares are issued before the store so that the branches do not
 * have to wait on them; cr0 holds the 16-byte test, cr1 the 4096-byte test.
 */
_GLOBAL(memcpy_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)		/* stash dest for the return value */

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)		/* stash dest for the return value */

	blt	.Lshort_copy
#endif
38
.Lnonvmx_copy:
	/*
	 * Integer-register copy path.
	 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
	 * r0 and r6-r12 are scratch; r14-r21 are borrowed inside the 128B loop.
	 * Execution falls through into .Lshort_copy at the end of this block.
	 */

	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6			/* cr7 = low 4 bits of -src */
	clrldi	r6,r6,(64-3)		/* r6 = (-src) & 7 = bytes copied below */

	bf	cr7*4+3,1f		/* 1s bit clear: no single byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* 2s bit clear: no halfword */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* 4s bit clear: no word */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* drop the alignment bytes from the count */
	cmpldi	r5,128
	blt	5f			/* less than one 128B block left */

	/*
	 * The loop below needs 16 data registers, so r14-r21 are spilled to a
	 * fresh stack frame and reloaded afterwards. r22 is saved and restored
	 * too, although nothing in this path writes it. LR is stored 16 bytes
	 * above the pre-stdu stack pointer; it is not reloaded here because no
	 * call is made on this path.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7			/* r6 = number of whole 128B blocks (>= 1) */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* r5 = bytes left after the 128B blocks */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE	/* pop the frame */

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 = r5 bits for 128/64/32/16 bytes */

6:	bf	cr7*4+1,7f		/* no 64B chunk */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f		/* no 32B chunk */
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f		/* no 16B chunk */
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* r5 = final 0-15 bytes for the short copy */
182
/*
 * Up to 15B to go.
 *
 * r5 holds the remaining length; only its low 4 bits are examined, one
 * bit per power-of-two chunk (8, 4, 2, 1). A length of 0 copies nothing.
 * Returns to the caller with r3 reloaded from 48(r1).
 */
.Lshort_copy:
	mtocrf	0x01,r5			/* cr7 = low 4 bits of the length */
	bf	cr7*4+0,12f		/* no 8B chunk */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* no 4B chunk */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* no 2B chunk */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* no final byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,48(r1)		/* return value: the original destination */
	blr
212
/*
 * Fallback from the VMX path: .Lvmx_copy branches here when the value
 * returned by .enter_vmx_copy compared equal to zero. Drop the stack frame
 * that .Lvmx_copy allocated and redo the copy with the integer loop.
 * r3/r4/r5 have already been reloaded by .Lvmx_copy at this point.
 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy
216
#ifdef CONFIG_ALTIVEC
/*
 * VMX entry, taken for lengths above 4096 bytes.
 *
 * Saves src, length and LR at 56/64/16(r1) (dest is already at 48(r1)),
 * allocates a stack frame and calls .enter_vmx_copy. Its return value is
 * compared with zero straight away; the resulting cr0 is only consumed by
 * the beq at the end of this block, so nothing in between may write cr0.
 * NOTE(review): a zero return presumably means VMX cannot be used in the
 * current context -- confirm against .enter_vmx_copy.
 */
.Lvmx_copy:
	mflr	r0
	std	r4,56(r1)
	std	r5,64(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0			/* cr0.eq <- (return value == 0) */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)	/* dest */
	ld	r4,STACKFRAMESIZE+56(r1)	/* src */
	ld	r5,STACKFRAMESIZE+64(r1)	/* length */
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7		/* source, rounded down to a 128B boundary */
	clrrdi	r9,r3,7		/* destination, rounded down likewise */
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	cr1,r7,0x3FF
	ble	cr1,1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32	/* lis sign-extends; clear the upper 32 bits */

	/*
	 * r0 in the RA position below encodes "no base register", so the
	 * depth constant still sitting in r0 is not part of any address.
	 */
.machine push
.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
.machine pop

	beq	.Lunwind_stack_nonvmx_copy	/* cr0 from the cmpwi above */
264
/*
 * VMX copy, relatively-aligned case. Entered by fall-through from the
 * prefetch setup in .Lvmx_copy with the stack frame still allocated.
 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
 * r9-r12 and r14-r16 hold the constant lvx/stvx index offsets.
 */
	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* (src ^ dest) & 15, setting cr0 */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* cr7 = low 4 bits of -dest */
	clrldi	r6,r6,(64-4)		/* r6 = (-dest) & 15 = bytes copied below */

	bf	cr7*4+3,1f		/* no single byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* no halfword */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* no word */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* no doubleword */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 = bits 4-7 of -dest: 16B units needed */
	clrldi	r6,r6,(64-7)		/* r6 = (-dest) & 127 = bytes copied below */

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f		/* no 16B chunk */
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f		/* no 32B chunk */
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* no 64B chunk */
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* r6 = number of whole 128B blocks */

	/* r14-r16 become index registers for the loop; spill them first. */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,r0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 = r5 bits for 128/64/32/16 bytes */

	bf	cr7*4+1,9f		/* no 64B chunk */
	lvx	vr3,r0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,r0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* no 32B chunk */
	lvx	vr1,r0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,r0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* no 16B chunk */
	lvx	vr1,r0,r4
	addi	r4,r4,16
	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5			/* cr7 = low 4 bits of the length */
	bf	cr7*4+0,12f		/* no 8B chunk */
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* no 4B chunk */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* no 2B chunk */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* no final byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Pop the frame, reload the original dest as the return value. */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */
443
.Lvmx_unaligned_copy:
	/*
	 * VMX copy for the case where (src ^ dest) & 15 is non-zero.
	 * r3 = destination cursor, r4 = source cursor, r5 = bytes remaining.
	 * The stack frame allocated in .Lvmx_copy is still in place.
	 *
	 * Once the destination is 16B aligned, vr16 holds the lvsl permute
	 * control for the source and vr0 always holds the most recently
	 * loaded source quadword. Each vperm merges the previous and the next
	 * quadword into 16 bytes of output, so r4 runs 16 bytes ahead of the
	 * data actually consumed until it is unwound at label 11.
	 */

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* cr7 = low 4 bits of -dest */
	clrldi	r6,r6,(64-4)		/* r6 = (-dest) & 15 = bytes copied below */

	bf	cr7*4+3,1f		/* no single byte */
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f		/* no halfword */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f		/* no word */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f		/* no doubleword */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 = bits 4-7 of -dest: 16B units needed */
	clrldi	r6,r6,(64-7)		/* r6 = (-dest) & 127 = bytes copied below */

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
	lvx	vr0,0,r4	/* prime vr0 with the first source quadword */
	addi	r4,r4,16	/* r4 now runs 16B ahead */

	bf	cr7*4+3,5f		/* no 16B chunk */
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1	/* vr0 = vr1: keep "last loaded" in vr0 */

5:	bf	cr7*4+2,6f		/* no 32B chunk */
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f		/* no 64B chunk */
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* r6 = number of whole 128B blocks */

	/* r14-r16 become index registers for the loop; spill them first. */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 = r5 bits for 128/64/32/16 bytes */

	bf	cr7*4+1,9f		/* no 64B chunk */
	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f		/* no 32B chunk */
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
	stvx	vr8,r0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f		/* no 16B chunk */
	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5			/* cr7 = low 4 bits of the length */
	bf	cr7*4+0,12f		/* no 8B chunk */
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f		/* no 4B chunk */
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f		/* no 2B chunk */
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f		/* no final byte */
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Pop the frame, reload the original dest as the return value. */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,48(r1)
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */