Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--  arch/powerpc/lib/Makefile             5
-rw-r--r--  arch/powerpc/lib/checksum_64.S       27
-rw-r--r--  arch/powerpc/lib/code-patching.c     14
-rw-r--r--  arch/powerpc/lib/copypage_64.S        4
-rw-r--r--  arch/powerpc/lib/copypage_power7.S  165
-rw-r--r--  arch/powerpc/lib/copyuser_power7.S  157
-rw-r--r--  arch/powerpc/lib/crtsavres.S          5
-rw-r--r--  arch/powerpc/lib/hweight_64.S        14
-rw-r--r--  arch/powerpc/lib/ldstfp.S            12
-rw-r--r--  arch/powerpc/lib/memcpy_64.S          4
-rw-r--r--  arch/powerpc/lib/memcpy_power7.S    647
-rw-r--r--  arch/powerpc/lib/string.S             2
-rw-r--r--  arch/powerpc/lib/string_64.S        202
-rw-r--r--  arch/powerpc/lib/vmx-helper.c (renamed from arch/powerpc/lib/copyuser_power7_vmx.c)  27
14 files changed, 1199 insertions, 86 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7735a2c2e6d9..746e0c895cd7 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -17,14 +17,15 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o \
 			   checksum_wrappers_64.o hweight_64.o \
-			   copyuser_power7.o
+			   copyuser_power7.o string_64.o copypage_power7.o \
+			   memcpy_power7.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
 
 ifeq ($(CONFIG_PPC64),y)
 obj-$(CONFIG_SMP)	+= locks.o
-obj-$(CONFIG_ALTIVEC)	+= copyuser_power7_vmx.o
+obj-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 endif
 
 obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 18245af38aea..167f72555d60 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -65,9 +65,6 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
-#define STACKFRAMESIZE 256
-#define STK_REG(i)	(112 + ((i)-14)*8)
-
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
@@ -114,9 +111,9 @@ _GLOBAL(csum_partial)
 	mtctr	r6
 
 	stdu	r1,-STACKFRAMESIZE(r1)
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 	ld	r6,0(r3)
 	ld	r9,8(r3)
@@ -175,9 +172,9 @@ _GLOBAL(csum_partial)
 	adde	r0,r0,r15
 	adde	r0,r0,r16
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 	addi	r1,r1,STACKFRAMESIZE
 
 	andi.	r4,r4,63
@@ -299,9 +296,9 @@ dest; sth r6,0(r4)
 	mtctr	r6
 
 	stdu	r1,-STACKFRAMESIZE(r1)
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 source;	ld	r6,0(r3)
 source;	ld	r9,8(r3)
@@ -382,9 +379,9 @@ dest; std r16,56(r4)
 	adde	r0,r0,r15
 	adde	r0,r0,r16
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 	addi	r1,r1,STACKFRAMESIZE
 
 	andi.	r5,r5,63
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 7c975d43e3f3..dd223b3eb333 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -13,17 +13,23 @@
 #include <linux/mm.h>
 #include <asm/page.h>
 #include <asm/code-patching.h>
+#include <asm/uaccess.h>
 
 
-void patch_instruction(unsigned int *addr, unsigned int instr)
+int patch_instruction(unsigned int *addr, unsigned int instr)
 {
-	*addr = instr;
+	int err;
+
+	err = __put_user(instr, addr);
+	if (err)
+		return err;
 	asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr));
+	return 0;
 }
 
-void patch_branch(unsigned int *addr, unsigned long target, int flags)
+int patch_branch(unsigned int *addr, unsigned long target, int flags)
 {
-	patch_instruction(addr, create_branch(addr, target, flags));
+	return patch_instruction(addr, create_branch(addr, target, flags));
 }
 
 unsigned int create_branch(const unsigned int *addr,
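
With patch_instruction() now returning an error from __put_user(), callers can detect a faulting patch address instead of assuming the store succeeded. A minimal caller sketch (the site pointer and the locally defined NOP encoding are illustrative, not part of this patch):

	#define MY_PPC_NOP	0x60000000	/* "ori r0,r0,0" encodes a nop */

	/* Hypothetical helper: patch a site to a nop and propagate failure. */
	static int patch_site_to_nop(unsigned int *site)
	{
		int err = patch_instruction(site, MY_PPC_NOP);

		if (err)
			pr_warn("code patch at %p failed: %d\n", site, err);
		return err;
	}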
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 53dcb6b1b708..9f9434a85264 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -17,7 +17,11 @@ PPC64_CACHES:
 	.section	".text"
 
 _GLOBAL(copy_page)
+BEGIN_FTR_SECTION
 	lis	r5,PAGE_SIZE@h
+FTR_SECTION_ELSE
+	b	.copypage_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
 	ld	r10,PPC64_CACHES@toc(r2)
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
new file mode 100644
index 000000000000..0ef75bf0695c
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -0,0 +1,165 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copypage_power7)
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side. Since source and destination are page
+	 * aligned we don't need to clear the bottom 7 bits of either
+	 * address.
+	 */
+	ori	r9,r3,1		/* stream=1 */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	lis	r7,0x0E01	/* depth=7, units=512 */
+#else
+	lis	r7,0x0E00	/* depth=7 */
+	ori	r7,r7,0x1000	/* units=32 */
+#endif
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r4,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+#ifdef CONFIG_ALTIVEC
+	mflr	r0
+	std	r3,48(r1)
+	std	r4,56(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	mtlr	r0
+
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	beq	.Lnonvmx_copy
+
+	addi	r1,r1,STACKFRAMESIZE
+
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	li	r9,64
+	li	r10,80
+	li	r11,96
+	li	r12,112
+
+	.align	5
+1:	lvx	vr7,r0,r4
+	lvx	vr6,r4,r6
+	lvx	vr5,r4,r7
+	lvx	vr4,r4,r8
+	lvx	vr3,r4,r9
+	lvx	vr2,r4,r10
+	lvx	vr1,r4,r11
+	lvx	vr0,r4,r12
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r6
+	stvx	vr5,r3,r7
+	stvx	vr4,r3,r8
+	stvx	vr3,r3,r9
+	stvx	vr2,r3,r10
+	stvx	vr1,r3,r11
+	stvx	vr0,r3,r12
+	addi	r3,r3,128
+	bdnz	1b
+
+	b	.exit_vmx_copy		/* tail call optimise */
+
+#else
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+#endif
+
+.Lnonvmx_copy:
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+
+1:	ld	r0,0(r4)
+	ld	r5,8(r4)
+	ld	r6,16(r4)
+	ld	r7,24(r4)
+	ld	r8,32(r4)
+	ld	r9,40(r4)
+	ld	r10,48(r4)
+	ld	r11,56(r4)
+	ld	r12,64(r4)
+	ld	r14,72(r4)
+	ld	r15,80(r4)
+	ld	r16,88(r4)
+	ld	r17,96(r4)
+	ld	r18,104(r4)
+	ld	r19,112(r4)
+	ld	r20,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r5,8(r3)
+	std	r6,16(r3)
+	std	r7,24(r3)
+	std	r8,32(r3)
+	std	r9,40(r3)
+	std	r10,48(r3)
+	std	r11,56(r3)
+	std	r12,64(r3)
+	std	r14,72(r3)
+	std	r15,80(r3)
+	std	r16,88(r3)
+	std	r17,96(r3)
+	std	r18,104(r3)
+	std	r19,112(r3)
+	std	r20,120(r3)
+	addi	r3,r3,128
+	bdnz	1b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	blr
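
For readers decoding the enhanced-touch setup above: the configuration values are plain integers built up in registers, and the sketch below simply restates the constants from the asm comments for the 4K-page case (field meanings taken from those comments, not an authoritative ISA breakdown):

	/* Stream prefetch control words as composed above (4K-page case). */
	static const unsigned long load_cfg  = 0x0E000000UL | 0x1000;		/* depth=7, units=32 */
	static const unsigned long store_cfg = (0x0E000000UL | 0x1000) | 1;	/* same, stream ID 1 */
	static const unsigned long go_word   = 0x80000000UL;			/* GO=1: start both streams */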
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 497db7b23bb1..f9ede7c6606e 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,9 +19,6 @@
  */
 #include <asm/ppc_asm.h>
 
-#define STACKFRAMESIZE 256
-#define STK_REG(i)	(112 + ((i)-14)*8)
-
 	.macro err1
 100:
 	.section __ex_table,"a"
@@ -57,26 +54,26 @@
 
 
 .Ldo_err4:
-	ld	r16,STK_REG(r16)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r14,STK_REG(r14)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
 .Ldo_err3:
-	bl	.exit_vmx_copy
+	bl	.exit_vmx_usercopy
 	ld	r0,STACKFRAMESIZE+16(r1)
 	mtlr	r0
 	b	.Lexit
 #endif /* CONFIG_ALTIVEC */
 
 .Ldo_err2:
-	ld	r22,STK_REG(r22)(r1)
-	ld	r21,STK_REG(r21)(r1)
-	ld	r20,STK_REG(r20)(r1)
-	ld	r19,STK_REG(r19)(r1)
-	ld	r18,STK_REG(r18)(r1)
-	ld	r17,STK_REG(r17)(r1)
-	ld	r16,STK_REG(r16)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r14,STK_REG(r14)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
 .Lexit:
 	addi	r1,r1,STACKFRAMESIZE
 .Ldo_err1:
@@ -137,15 +134,15 @@ err1; stw r0,0(r3)
 
 	mflr	r0
 	stdu	r1,-STACKFRAMESIZE(r1)
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
-	std	r17,STK_REG(r17)(r1)
-	std	r18,STK_REG(r18)(r1)
-	std	r19,STK_REG(r19)(r1)
-	std	r20,STK_REG(r20)(r1)
-	std	r21,STK_REG(r21)(r1)
-	std	r22,STK_REG(r22)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
 	std	r0,STACKFRAMESIZE+16(r1)
 
 	srdi	r6,r5,7
@@ -192,15 +189,15 @@ err2; std r21,120(r3)
 
 	clrldi	r5,r5,(64-7)
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
-	ld	r17,STK_REG(r17)(r1)
-	ld	r18,STK_REG(r18)(r1)
-	ld	r19,STK_REG(r19)(r1)
-	ld	r20,STK_REG(r20)(r1)
-	ld	r21,STK_REG(r21)(r1)
-	ld	r22,STK_REG(r22)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
 	addi	r1,r1,STACKFRAMESIZE
 
 	/* Up to 127B to go */
@@ -290,7 +287,7 @@ err1; stb r0,0(r3)
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	.enter_vmx_copy
+	bl	.enter_vmx_usercopy
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STACKFRAMESIZE+48(r1)
@@ -298,6 +295,68 @@ err1; stb r0,0(r3)
 	ld	r5,STACKFRAMESIZE+64(r1)
 	mtlr	r0
 
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	r7,0x3FF
+	ble	1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	cr1,r7,0x3FF
+	ble	cr1,1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
 	beq	.Lunwind_stack_nonvmx_copy
 
 	/*
@@ -378,9 +437,9 @@ err3; stvx vr0,r3,r11
 7:	sub	r5,r5,r6
 	srdi	r6,r5,7
 
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 	li	r12,64
 	li	r14,80
@@ -415,9 +474,9 @@ err4; stvx vr0,r3,r16
 	addi	r3,r3,128
 	bdnz	8b
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 
 	/* Up to 127B to go */
 	clrldi	r5,r5,(64-7)
@@ -476,7 +535,7 @@ err3; lbz r0,0(r4)
 err3;	stb	r0,0(r3)
 
 15:	addi	r1,r1,STACKFRAMESIZE
-	b	.exit_vmx_copy		/* tail call optimise */
+	b	.exit_vmx_usercopy	/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -563,9 +622,9 @@ err3; stvx vr11,r3,r11
 7:	sub	r5,r5,r6
 	srdi	r6,r5,7
 
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 	li	r12,64
 	li	r14,80
@@ -608,9 +667,9 @@ err4; stvx vr15,r3,r16
 	addi	r3,r3,128
 	bdnz	8b
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 
 	/* Up to 127B to go */
 	clrldi	r5,r5,(64-7)
@@ -679,5 +738,5 @@ err3; lbz r0,0(r4)
 err3;	stb	r0,0(r3)
 
 15:	addi	r1,r1,STACKFRAMESIZE
-	b	.exit_vmx_copy		/* tail call optimise */
+	b	.exit_vmx_usercopy	/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S
index 1c893f05d224..b2c68ce139ae 100644
--- a/arch/powerpc/lib/crtsavres.S
+++ b/arch/powerpc/lib/crtsavres.S
@@ -41,12 +41,13 @@
 #include <asm/ppc_asm.h>
 
 	.file	"crtsavres.S"
-	.section ".text"
 
 #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 
 #ifndef CONFIG_PPC64
 
+	.section ".text"
+
 /* Routines for saving integer registers, called by the compiler.  */
 /* Called with r11 pointing to the stack header word of the caller of the */
 /* function, just beyond the end of the integer save area.  */
@@ -232,6 +233,8 @@ _GLOBAL(_rest32gpr_31_x)
 
 #else /* CONFIG_PPC64 */
 
+	.section ".text.save.restore","ax",@progbits
+
 .globl	_savegpr0_14
 _savegpr0_14:
 	std	r14,-144(r1)
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
index fda27868cf8c..9b96ff2ecd4d 100644
--- a/arch/powerpc/lib/hweight_64.S
+++ b/arch/powerpc/lib/hweight_64.S
@@ -28,7 +28,7 @@ BEGIN_FTR_SECTION
 	nop
 	nop
 FTR_SECTION_ELSE
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
@@ -42,14 +42,14 @@ BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
   BEGIN_FTR_SECTION_NESTED(50)
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	srdi	r4,r3,8
 	add	r3,r4,r3
 	clrldi	r3,r3,64-8
 	blr
   FTR_SECTION_ELSE_NESTED(50)
 	clrlwi	r3,r3,16
-	PPC_POPCNTW(r3,r3)
+	PPC_POPCNTW(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
   ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
@@ -66,7 +66,7 @@ BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
   BEGIN_FTR_SECTION_NESTED(51)
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	srdi	r4,r3,16
 	add	r3,r4,r3
 	srdi	r4,r3,8
@@ -74,7 +74,7 @@ FTR_SECTION_ELSE
 	clrldi	r3,r3,64-8
 	blr
   FTR_SECTION_ELSE_NESTED(51)
-	PPC_POPCNTW(r3,r3)
+	PPC_POPCNTW(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
   ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
@@ -93,7 +93,7 @@ BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
   BEGIN_FTR_SECTION_NESTED(52)
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	srdi	r4,r3,32
 	add	r3,r4,r3
 	srdi	r4,r3,16
@@ -103,7 +103,7 @@ FTR_SECTION_ELSE
 	clrldi	r3,r3,64-8
 	blr
   FTR_SECTION_ELSE_NESTED(52)
-	PPC_POPCNTD(r3,r3)
+	PPC_POPCNTD(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index 6a85380520b6..85aec08ab234 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -330,13 +330,13 @@ _GLOBAL(do_lxvd2x)
 	MTMSRD(r7)
 	isync
 	beq	cr7,1f
-	STXVD2X(0,r1,r8)
+	STXVD2X(0,R1,R8)
 1:	li	r9,-EFAULT
-2:	LXVD2X(0,0,r4)
+2:	LXVD2X(0,R0,R4)
 	li	r9,0
 3:	beq	cr7,4f
 	bl	put_vsr
-	LXVD2X(0,r1,r8)
+	LXVD2X(0,R1,R8)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
 	MTMSRD(r6)
@@ -358,13 +358,13 @@ _GLOBAL(do_stxvd2x)
 	MTMSRD(r7)
 	isync
 	beq	cr7,1f
-	STXVD2X(0,r1,r8)
+	STXVD2X(0,R1,R8)
 	bl	get_vsr
 1:	li	r9,-EFAULT
-2:	STXVD2X(0,0,r4)
+2:	STXVD2X(0,R0,R4)
 	li	r9,0
 3:	beq	cr7,4f
-	LXVD2X(0,r1,r8)
+	LXVD2X(0,R1,R8)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
 	MTMSRD(r6)
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 82fea3963e15..d2bbbc8d7dc0 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -11,7 +11,11 @@
 
 	.align	7
 _GLOBAL(memcpy)
+BEGIN_FTR_SECTION
 	std	r3,48(r1)	/* save destination pointer for return value */
+FTR_SECTION_ELSE
+	b	memcpy_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
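
The feature section above is resolved at boot by the feature-fixup machinery, so no test runs at call time; conceptually it is equivalent to this sketch (memcpy_generic is a made-up name for the pre-existing fallback path that follows):

	/* Illustrative C shape of the boot-time patched dispatch above. */
	void *memcpy_dispatch(void *to, const void *from, unsigned long n)
	{
		if (cpu_has_feature(CPU_FTR_VMX_COPY))
			return memcpy_power7(to, from, n);
		return memcpy_generic(to, from, n);	/* hypothetical name */
	}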
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
new file mode 100644
index 000000000000..0efdc51bc716
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -0,0 +1,647 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+_GLOBAL(memcpy_power7)
+#ifdef CONFIG_ALTIVEC
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+#else
+	cmpldi	r5,16
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+#endif
+
+.Lnonvmx_copy:
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	mflr	r0
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
+	std	r0,STACKFRAMESIZE+16(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	ld	r14,64(r4)
+	ld	r15,72(r4)
+	ld	r16,80(r4)
+	ld	r17,88(r4)
+	ld	r18,96(r4)
+	ld	r19,104(r4)
+	ld	r20,112(r4)
+	ld	r21,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	std	r14,64(r3)
+	std	r15,72(r3)
+	std	r16,80(r3)
+	std	r17,88(r3)
+	std	r18,96(r3)
+	std	r19,104(r3)
+	std	r20,112(r3)
+	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	addi	r4,r4,64
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	addi	r4,r4,32
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	addi	r4,r4,16
+	std	r0,0(r3)
+	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	ld	r3,48(r1)
+	blr
+
+.Lunwind_stack_nonvmx_copy:
+	addi	r1,r1,STACKFRAMESIZE
+	b	.Lnonvmx_copy
+
+#ifdef CONFIG_ALTIVEC
+.Lvmx_copy:
+	mflr	r0
+	std	r4,56(r1)
+	std	r5,64(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	ld	r5,STACKFRAMESIZE+64(r1)
+	mtlr	r0
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	cr1,r7,0x3FF
+	ble	cr1,1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+	beq	.Lunwind_stack_nonvmx_copy
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the destination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	lvx	vr6,r4,r9
+	lvx	vr5,r4,r10
+	lvx	vr4,r4,r11
+	lvx	vr3,r4,r12
+	lvx	vr2,r4,r14
+	lvx	vr1,r4,r15
+	lvx	vr0,r4,r16
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r9
+	stvx	vr5,r3,r10
+	stvx	vr4,r3,r11
+	stvx	vr3,r3,r12
+	stvx	vr2,r3,r14
+	stvx	vr1,r3,r15
+	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	b	.exit_vmx_copy		/* tail call optimise */
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r7,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the destination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	lvsl	vr16,0,r4	/* Setup permute control vector */
+	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	vperm	vr8,vr0,vr7,vr16
+	lvx	vr6,r4,r9
+	vperm	vr9,vr7,vr6,vr16
+	lvx	vr5,r4,r10
+	vperm	vr10,vr6,vr5,vr16
+	lvx	vr4,r4,r11
+	vperm	vr11,vr5,vr4,vr16
+	lvx	vr3,r4,r12
+	vperm	vr12,vr4,vr3,vr16
+	lvx	vr2,r4,r14
+	vperm	vr13,vr3,vr2,vr16
+	lvx	vr1,r4,r15
+	vperm	vr14,vr2,vr1,vr16
+	lvx	vr0,r4,r16
+	vperm	vr15,vr1,vr0,vr16
+	addi	r4,r4,128
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	stvx	vr12,r3,r12
+	stvx	vr13,r3,r14
+	stvx	vr14,r3,r15
+	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	b	.exit_vmx_copy		/* tail call optimise */
+#endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 093d6316435c..1b5a0a09d609 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -119,6 +119,7 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(__clear_user)
 	addi	r6,r3,-4
 	li	r3,0
@@ -160,3 +161,4 @@ _GLOBAL(__clear_user)
 	PPC_LONG	1b,91b
 	PPC_LONG	8b,92b
 	.text
+#endif
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
new file mode 100644
index 000000000000..3b1e48049faf
--- /dev/null
+++ b/arch/powerpc/lib/string_64.S
@@ -0,0 +1,202 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @n:    Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+
+	.macro err1
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Ldo_err1
+	.previous
+	.endm
+
+	.macro err2
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldo_err2
+	.previous
+	.endm
+
+	.macro err3
+300:
+	.section __ex_table,"a"
+	.align 3
+	.llong 300b,.Ldo_err3
+	.previous
+	.endm
+
+.Ldo_err1:
+	mr	r3,r8
+
+.Ldo_err2:
+	mtctr	r4
+1:
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+	addi	r4,r4,-1
+	bdnz	1b
+
+.Ldo_err3:
+	mr	r3,r4
+	blr
+
+_GLOBAL(__clear_user)
+	cmpdi	r4,32
+	neg	r6,r3
+	li	r0,0
+	blt	.Lshort_clear
+	mr	r8,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	/* Get the destination 8 byte aligned */
+	bf	cr7*4+3,1f
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r4,r4,r6
+
+	cmpdi	r4,32
+	cmpdi	cr1,r4,512
+	blt	.Lshort_clear
+	bgt	cr1,.Llong_clear
+
+.Lmedium_clear:
+	srdi	r6,r4,5
+	mtctr	r6
+
+	/* Do 32 byte chunks */
+4:
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+err2;	std	r0,16(r3)
+err2;	std	r0,24(r3)
+	addi	r3,r3,32
+	addi	r4,r4,-32
+	bdnz	4b
+
+.Lshort_clear:
+	/* up to 31 bytes to go */
+	cmpdi	r4,16
+	blt	6f
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+	addi	r3,r3,16
+	addi	r4,r4,-16
+
+	/* Up to 15 bytes to go */
+6:	mr	r8,r3
+	clrldi	r4,r4,(64-4)
+	mtocrf	0x01,r4
+	bf	cr7*4+0,7f
+err1;	std	r0,0(r3)
+	addi	r3,r3,8
+
+7:	bf	cr7*4+1,8f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+8:	bf	cr7*4+2,9f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+9:	bf	cr7*4+3,10f
+err1;	stb	r0,0(r3)
+
+10:	li	r3,0
+	blr
+
+.Llong_clear:
+	ld	r5,PPC64_CACHES@toc(r2)
+
+	bf	cr7*4+0,11f
+err2;	std	r0,0(r3)
+	addi	r3,r3,8
+	addi	r4,r4,-8
+
+	/* Destination is 16 byte aligned, need to get it cacheline aligned */
+11:	lwz	r7,DCACHEL1LOGLINESIZE(r5)
+	lwz	r9,DCACHEL1LINESIZE(r5)
+
+	/*
+	 * With worst case alignment the long clear loop takes a minimum
+	 * of 1 byte less than 2 cachelines.
+	 */
+	sldi	r10,r9,2
+	cmpd	r4,r10
+	blt	.Lmedium_clear
+
+	neg	r6,r3
+	addi	r10,r9,-1
+	and.	r5,r6,r10
+	beq	13f
+
+	srdi	r6,r5,4
+	mtctr	r6
+	mr	r8,r3
+12:
+err1;	std	r0,0(r3)
+err1;	std	r0,8(r3)
+	addi	r3,r3,16
+	bdnz	12b
+
+	sub	r4,r4,r5
+
+13:	srd	r6,r4,r7
+	mtctr	r6
+	mr	r8,r3
+14:
+err1;	dcbz	r0,r3
+	add	r3,r3,r9
+	bdnz	14b
+
+	and	r4,r4,r10
+
+	cmpdi	r4,32
+	blt	.Lshort_clear
+	b	.Lmedium_clear
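
The kernel-doc comment above states the contract; the usual calling pattern looks like this sketch (written against the 2012-era access_ok() signature, for illustration only):

	/* Sketch: zero a user buffer, honouring the documented contract. */
	static int zero_user_buffer(void __user *buf, unsigned long len)
	{
		if (!access_ok(VERIFY_WRITE, buf, len))
			return -EFAULT;

		/* __clear_user() returns the number of bytes NOT cleared */
		return __clear_user(buf, len) ? -EFAULT : 0;
	}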
diff --git a/arch/powerpc/lib/copyuser_power7_vmx.c b/arch/powerpc/lib/vmx-helper.c
index bf2654f2b68e..3cf529ceec5b 100644
--- a/arch/powerpc/lib/copyuser_power7_vmx.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -22,7 +22,7 @@
 #include <linux/hardirq.h>
 #include <asm/switch_to.h>
 
-int enter_vmx_copy(void)
+int enter_vmx_usercopy(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -44,8 +44,31 @@ int enter_vmx_copy(void)
  * This function must return 0 because we tail call optimise when calling
  * from __copy_tofrom_user_power7 which returns 0 on success.
  */
-int exit_vmx_copy(void)
+int exit_vmx_usercopy(void)
 {
 	pagefault_enable();
 	return 0;
 }
+
+int enter_vmx_copy(void)
+{
+	if (in_interrupt())
+		return 0;
+
+	preempt_disable();
+
+	enable_kernel_altivec();
+
+	return 1;
+}
+
+/*
+ * All calls to this function will be optimised into tail calls. We are
+ * passed a pointer to the destination which we return as required by a
+ * memcpy implementation.
+ */
+void *exit_vmx_copy(void *dest)
+{
+	preempt_enable();
+	return dest;
+}
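
The new enter_vmx_copy()/exit_vmx_copy() pair brackets the Altivec section of memcpy_power7 the same way the usercopy variants bracket __copy_tofrom_user_power7; in C shape the usage is roughly as follows (the two copy helpers named below are made-up stand-ins for the assembly loops):

	/* Illustrative only: how memcpy_power7.S uses the helpers above. */
	void *memcpy_vmx_sketch(void *dest, const void *src, unsigned long n)
	{
		if (!enter_vmx_copy())
			return scalar_copy(dest, src, n);	/* hypothetical fallback */

		vmx_loop_copy(dest, src, n);			/* hypothetical VMX loop */

		return exit_vmx_copy(dest);	/* tail call: returns dest */
	}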