author		Anton Blanchard <anton@samba.org>	2013-09-22 22:04:35 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2013-10-11 01:48:25 -0400
commit		32ee1e188eadd7c997837649a107fd1c50feef7a (patch)
tree		6d7bd0eea1b8c062845bb11610c25e24968164d2 /arch/powerpc/lib
parent		8b5ede69d24db939f52b47effff2f6fe1e83e08b (diff)
powerpc: Fix endian issues in VMX copy loops
Fix the permute loops for little endian.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
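
Background for the fix: these loops handle a misaligned source with aligned 16-byte loads whose adjacent pairs are merged by vperm under a control vector produced by lvsl. On little endian the byte numbering inside vector registers is reversed, so the equivalent selection needs lvsr and the two vperm source operands exchanged, which is what the new LVS and VPERM macros encode. The following is a minimal C model of the big-endian lvsl/vperm semantics, for illustration only; the helper names lvsl_ctl and vperm_model are ours, not kernel or ISA names.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* lvsl (big-endian semantics): control vector { sh, sh+1, ..., sh+15 },
 * where sh is the low 4 bits of the source address. */
static void lvsl_ctl(uintptr_t addr, uint8_t ctl[16])
{
	unsigned sh = addr & 0xf;
	for (int i = 0; i < 16; i++)
		ctl[i] = (uint8_t)(sh + i);
}

/* vperm (big-endian semantics): out[i] = (a ++ b)[ctl[i] & 0x1f],
 * i.e. each result byte is selected from the 32-byte concatenation
 * of the two source vectors. */
static void vperm_model(const uint8_t a[16], const uint8_t b[16],
			const uint8_t ctl[16], uint8_t out[16])
{
	uint8_t cat[32];
	memcpy(cat, a, 16);
	memcpy(cat + 16, b, 16);
	for (int i = 0; i < 16; i++)
		out[i] = cat[ctl[i] & 0x1f];
}

int main(void)
{
	uint8_t src[48], prev[16], next[16], ctl[16], out[16];
	for (int i = 0; i < 48; i++)
		src[i] = (uint8_t)i;

	uintptr_t misaligned = 5;	/* pretend the source address % 16 == 5 */
	lvsl_ctl(misaligned, ctl);

	/* Two aligned 16-byte loads straddling the misaligned source... */
	memcpy(prev, src, 16);
	memcpy(next, src + 16, 16);
	/* ...merged into the 16 unaligned bytes starting at offset 5. */
	vperm_model(prev, next, ctl, out);

	for (int i = 0; i < 16; i++)
		printf("%d ", out[i]);	/* prints 5 6 7 ... 20 */
	printf("\n");
	return 0;
}

Running the model prints bytes 5 through 20: the 16 source bytes starting at the misaligned offset, recovered from two aligned loads, which is the selection the patched loops must reproduce on both endiannesses.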
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--	arch/powerpc/lib/copyuser_power7.S	54
-rw-r--r--	arch/powerpc/lib/memcpy_power7.S	55
2 files changed, 63 insertions, 46 deletions
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index d1f11795a7ad..e8e9c36dc784 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,6 +19,14 @@
  */
 #include <asm/ppc_asm.h>
 
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 	.macro err1
 100:
 	.section __ex_table,"a"
@@ -552,13 +560,13 @@ err3;	stw	r7,4(r3)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 err3;	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 err3;	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -566,9 +574,9 @@ err3;	stvx	vr8,r0,r3
 
 5:	bf	cr7*4+2,6f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 err3;	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -576,13 +584,13 @@ err3;	stvx	vr9,r3,r9
 
 6:	bf	cr7*4+1,7f
 err3;	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 err3;	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 err3;	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 err3;	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -611,21 +619,21 @@ err3;	stvx	vr11,r3,r11
 	.align	5
 8:
 err4;	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 err4;	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 err4;	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 err4;	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 err4;	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 err4;	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 err4;	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 err4;	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 err4;	stvx	vr8,r0,r3
 err4;	stvx	vr9,r3,r9
@@ -649,13 +657,13 @@ err4;	stvx	vr15,r3,r16
 
 	bf	cr7*4+1,9f
 err3;	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 err3;	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 err3;	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 err3;	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -665,9 +673,9 @@ err3;	stvx	vr11,r3,r11
 
 9:	bf	cr7*4+2,10f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 err3;	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 err3;	stvx	vr8,r0,r3
 err3;	stvx	vr9,r3,r9
@@ -675,7 +683,7 @@ err3;	stvx	vr9,r3,r9
 
 10:	bf	cr7*4+3,11f
 err3;	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 err3;	stvx	vr8,r0,r3
 	addi	r3,r3,16
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
index 0663630baf3b..e4177dbea6bd 100644
--- a/arch/powerpc/lib/memcpy_power7.S
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -20,6 +20,15 @@
 #include <asm/ppc_asm.h>
 
 _GLOBAL(memcpy_power7)
+
+#ifdef __BIG_ENDIAN__
+#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
+#else
+#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
+#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
+#endif
+
 #ifdef CONFIG_ALTIVEC
 	cmpldi	r5,16
 	cmpldi	cr1,r5,4096
@@ -485,13 +494,13 @@ _GLOBAL(memcpy_power7)
 	li	r10,32
 	li	r11,48
 
-	lvsl	vr16,0,r4	/* Setup permute control vector */
+	LVS(vr16,0,r4)		/* Setup permute control vector */
 	lvx	vr0,0,r4
 	addi	r4,r4,16
 
 	bf	cr7*4+3,5f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16
@@ -499,9 +508,9 @@ _GLOBAL(memcpy_power7)
 
 5:	bf	cr7*4+2,6f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -509,13 +518,13 @@ _GLOBAL(memcpy_power7)
 
 6:	bf	cr7*4+1,7f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -544,21 +553,21 @@ _GLOBAL(memcpy_power7)
 	.align	5
 8:
 	lvx	vr7,r0,r4
-	vperm	vr8,vr0,vr7,vr16
+	VPERM(vr8,vr0,vr7,vr16)
 	lvx	vr6,r4,r9
-	vperm	vr9,vr7,vr6,vr16
+	VPERM(vr9,vr7,vr6,vr16)
 	lvx	vr5,r4,r10
-	vperm	vr10,vr6,vr5,vr16
+	VPERM(vr10,vr6,vr5,vr16)
 	lvx	vr4,r4,r11
-	vperm	vr11,vr5,vr4,vr16
+	VPERM(vr11,vr5,vr4,vr16)
 	lvx	vr3,r4,r12
-	vperm	vr12,vr4,vr3,vr16
+	VPERM(vr12,vr4,vr3,vr16)
 	lvx	vr2,r4,r14
-	vperm	vr13,vr3,vr2,vr16
+	VPERM(vr13,vr3,vr2,vr16)
 	lvx	vr1,r4,r15
-	vperm	vr14,vr2,vr1,vr16
+	VPERM(vr14,vr2,vr1,vr16)
 	lvx	vr0,r4,r16
-	vperm	vr15,vr1,vr0,vr16
+	VPERM(vr15,vr1,vr0,vr16)
 	addi	r4,r4,128
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -582,13 +591,13 @@ _GLOBAL(memcpy_power7)
 
 	bf	cr7*4+1,9f
 	lvx	vr3,r0,r4
-	vperm	vr8,vr0,vr3,vr16
+	VPERM(vr8,vr0,vr3,vr16)
 	lvx	vr2,r4,r9
-	vperm	vr9,vr3,vr2,vr16
+	VPERM(vr9,vr3,vr2,vr16)
 	lvx	vr1,r4,r10
-	vperm	vr10,vr2,vr1,vr16
+	VPERM(vr10,vr2,vr1,vr16)
 	lvx	vr0,r4,r11
-	vperm	vr11,vr1,vr0,vr16
+	VPERM(vr11,vr1,vr0,vr16)
 	addi	r4,r4,64
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -598,9 +607,9 @@ _GLOBAL(memcpy_power7)
 
 9:	bf	cr7*4+2,10f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	lvx	vr0,r4,r9
-	vperm	vr9,vr1,vr0,vr16
+	VPERM(vr9,vr1,vr0,vr16)
 	addi	r4,r4,32
 	stvx	vr8,r0,r3
 	stvx	vr9,r3,r9
@@ -608,7 +617,7 @@ _GLOBAL(memcpy_power7)
 
 10:	bf	cr7*4+3,11f
 	lvx	vr1,r0,r4
-	vperm	vr8,vr0,vr1,vr16
+	VPERM(vr8,vr0,vr1,vr16)
 	addi	r4,r4,16
 	stvx	vr8,r0,r3
 	addi	r3,r3,16