Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r--  arch/powerpc/lib/Makefile             5
-rw-r--r--  arch/powerpc/lib/checksum_64.S       27
-rw-r--r--  arch/powerpc/lib/code-patching.c     14
-rw-r--r--  arch/powerpc/lib/copypage_64.S        4
-rw-r--r--  arch/powerpc/lib/copypage_power7.S  165
-rw-r--r--  arch/powerpc/lib/copyuser_power7.S  157
-rw-r--r--  arch/powerpc/lib/crtsavres.S          5
-rw-r--r--  arch/powerpc/lib/hweight_64.S        14
-rw-r--r--  arch/powerpc/lib/ldstfp.S            12
-rw-r--r--  arch/powerpc/lib/memcpy_64.S          4
-rw-r--r--  arch/powerpc/lib/memcpy_power7.S    647
-rw-r--r--  arch/powerpc/lib/string.S             2
-rw-r--r--  arch/powerpc/lib/string_64.S        202
-rw-r--r--  arch/powerpc/lib/vmx-helper.c (renamed from arch/powerpc/lib/copyuser_power7_vmx.c)  27
14 files changed, 1199 insertions, 86 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile
index 7735a2c2e6d9..746e0c895cd7 100644
--- a/arch/powerpc/lib/Makefile
+++ b/arch/powerpc/lib/Makefile
@@ -17,14 +17,15 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o
 obj-$(CONFIG_PPC64)	+= copypage_64.o copyuser_64.o \
 			   memcpy_64.o usercopy_64.o mem_64.o string.o \
 			   checksum_wrappers_64.o hweight_64.o \
-			   copyuser_power7.o
+			   copyuser_power7.o string_64.o copypage_power7.o \
+			   memcpy_power7.o
 obj-$(CONFIG_XMON)	+= sstep.o ldstfp.o
 obj-$(CONFIG_KPROBES)	+= sstep.o ldstfp.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= sstep.o ldstfp.o
 
 ifeq ($(CONFIG_PPC64),y)
 obj-$(CONFIG_SMP)	+= locks.o
-obj-$(CONFIG_ALTIVEC)	+= copyuser_power7_vmx.o
+obj-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 endif
 
 obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 18245af38aea..167f72555d60 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -65,9 +65,6 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
-#define STACKFRAMESIZE 256
-#define STK_REG(i)	(112 + ((i)-14)*8)
-
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
@@ -114,9 +111,9 @@ _GLOBAL(csum_partial)
 	mtctr	r6
 
 	stdu	r1,-STACKFRAMESIZE(r1)
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 	ld	r6,0(r3)
 	ld	r9,8(r3)
@@ -175,9 +172,9 @@ _GLOBAL(csum_partial)
 	adde	r0,r0,r15
 	adde	r0,r0,r16
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 	addi	r1,r1,STACKFRAMESIZE
 
 	andi.	r4,r4,63
@@ -299,9 +296,9 @@ dest; sth r6,0(r4)
 	mtctr	r6
 
 	stdu	r1,-STACKFRAMESIZE(r1)
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 source;	ld	r6,0(r3)
 source;	ld	r9,8(r3)
@@ -382,9 +379,9 @@ dest; std r16,56(r4)
 	adde	r0,r0,r15
 	adde	r0,r0,r16
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 	addi	r1,r1,STACKFRAMESIZE
 
 	andi.	r5,r5,63
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 7c975d43e3f3..dd223b3eb333 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -13,17 +13,23 @@
 #include <linux/mm.h>
 #include <asm/page.h>
 #include <asm/code-patching.h>
+#include <asm/uaccess.h>
 
 
-void patch_instruction(unsigned int *addr, unsigned int instr)
+int patch_instruction(unsigned int *addr, unsigned int instr)
 {
-	*addr = instr;
+	int err;
+
+	err = __put_user(instr, addr);
+	if (err)
+		return err;
 	asm ("dcbst 0, %0; sync; icbi 0,%0; sync; isync" : : "r" (addr));
+	return 0;
 }
 
-void patch_branch(unsigned int *addr, unsigned long target, int flags)
+int patch_branch(unsigned int *addr, unsigned long target, int flags)
 {
-	patch_instruction(addr, create_branch(addr, target, flags));
+	return patch_instruction(addr, create_branch(addr, target, flags));
 }
 
 unsigned int create_branch(const unsigned int *addr,
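
With patch_instruction() now returning an error from __put_user(), callers can detect a faulting patch address instead of assuming the store succeeded. A minimal caller sketch (the site pointer and the locally defined NOP encoding are illustrative, not part of this patch):

	#define MY_PPC_NOP	0x60000000	/* "ori r0,r0,0" encodes a nop */

	/* Hypothetical helper: patch a site to a nop and propagate failure. */
	static int patch_site_to_nop(unsigned int *site)
	{
		int err = patch_instruction(site, MY_PPC_NOP);

		if (err)
			pr_warn("code patch at %p failed: %d\n", site, err);
		return err;
	}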
diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S
index 53dcb6b1b708..9f9434a85264 100644
--- a/arch/powerpc/lib/copypage_64.S
+++ b/arch/powerpc/lib/copypage_64.S
@@ -17,7 +17,11 @@ PPC64_CACHES:
 	.section	".text"
 
 _GLOBAL(copy_page)
+BEGIN_FTR_SECTION
 	lis	r5,PAGE_SIZE@h
+FTR_SECTION_ELSE
+	b	.copypage_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	ori	r5,r5,PAGE_SIZE@l
 BEGIN_FTR_SECTION
 	ld	r10,PPC64_CACHES@toc(r2)
diff --git a/arch/powerpc/lib/copypage_power7.S b/arch/powerpc/lib/copypage_power7.S
new file mode 100644
index 000000000000..0ef75bf0695c
--- /dev/null
+++ b/arch/powerpc/lib/copypage_power7.S
@@ -0,0 +1,165 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/page.h>
+#include <asm/ppc_asm.h>
+
+_GLOBAL(copypage_power7)
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side. Since source and destination are page
+	 * aligned we don't need to clear the bottom 7 bits of either
+	 * address.
+	 */
+	ori	r9,r3,1		/* stream=1 */
+
+#ifdef CONFIG_PPC_64K_PAGES
+	lis	r7,0x0E01	/* depth=7, units=512 */
+#else
+	lis	r7,0x0E00	/* depth=7 */
+	ori	r7,r7,0x1000	/* units=32 */
+#endif
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r4,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+#ifdef CONFIG_ALTIVEC
+	mflr	r0
+	std	r3,48(r1)
+	std	r4,56(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	mtlr	r0
+
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	beq	.Lnonvmx_copy
+
+	addi	r1,r1,STACKFRAMESIZE
+
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	li	r9,64
+	li	r10,80
+	li	r11,96
+	li	r12,112
+
+	.align	5
+1:	lvx	vr7,r0,r4
+	lvx	vr6,r4,r6
+	lvx	vr5,r4,r7
+	lvx	vr4,r4,r8
+	lvx	vr3,r4,r9
+	lvx	vr2,r4,r10
+	lvx	vr1,r4,r11
+	lvx	vr0,r4,r12
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r6
+	stvx	vr5,r3,r7
+	stvx	vr4,r3,r8
+	stvx	vr3,r3,r9
+	stvx	vr2,r3,r10
+	stvx	vr1,r3,r11
+	stvx	vr0,r3,r12
+	addi	r3,r3,128
+	bdnz	1b
+
+	b	.exit_vmx_copy		/* tail call optimise */
+
+#else
+	li	r0,(PAGE_SIZE/128)
+	mtctr	r0
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+#endif
+
+.Lnonvmx_copy:
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+
+1:	ld	r0,0(r4)
+	ld	r5,8(r4)
+	ld	r6,16(r4)
+	ld	r7,24(r4)
+	ld	r8,32(r4)
+	ld	r9,40(r4)
+	ld	r10,48(r4)
+	ld	r11,56(r4)
+	ld	r12,64(r4)
+	ld	r14,72(r4)
+	ld	r15,80(r4)
+	ld	r16,88(r4)
+	ld	r17,96(r4)
+	ld	r18,104(r4)
+	ld	r19,112(r4)
+	ld	r20,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r5,8(r3)
+	std	r6,16(r3)
+	std	r7,24(r3)
+	std	r8,32(r3)
+	std	r9,40(r3)
+	std	r10,48(r3)
+	std	r11,56(r3)
+	std	r12,64(r3)
+	std	r14,72(r3)
+	std	r15,80(r3)
+	std	r16,88(r3)
+	std	r17,96(r3)
+	std	r18,104(r3)
+	std	r19,112(r3)
+	std	r20,120(r3)
+	addi	r3,r3,128
+	bdnz	1b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+	blr
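
For readers decoding the enhanced-touch setup above: the configuration values are plain integers built up in registers, and the sketch below simply restates the constants from the asm comments for the 4K-page case (field meanings taken from those comments, not an authoritative ISA breakdown):

	/* Stream prefetch control words as composed above (4K-page case). */
	static const unsigned long load_cfg  = 0x0E000000UL | 0x1000;		/* depth=7, units=32 */
	static const unsigned long store_cfg = (0x0E000000UL | 0x1000) | 1;	/* same, stream ID 1 */
	static const unsigned long go_word   = 0x80000000UL;			/* GO=1: start both streams */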
diff --git a/arch/powerpc/lib/copyuser_power7.S b/arch/powerpc/lib/copyuser_power7.S
index 497db7b23bb1..f9ede7c6606e 100644
--- a/arch/powerpc/lib/copyuser_power7.S
+++ b/arch/powerpc/lib/copyuser_power7.S
@@ -19,9 +19,6 @@
  */
 #include <asm/ppc_asm.h>
 
-#define STACKFRAMESIZE 256
-#define STK_REG(i)	(112 + ((i)-14)*8)
-
 	.macro err1
 100:
 	.section __ex_table,"a"
@@ -57,26 +54,26 @@
 
 
 .Ldo_err4:
-	ld	r16,STK_REG(r16)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r14,STK_REG(r14)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
 .Ldo_err3:
-	bl	.exit_vmx_copy
+	bl	.exit_vmx_usercopy
 	ld	r0,STACKFRAMESIZE+16(r1)
 	mtlr	r0
 	b	.Lexit
 #endif /* CONFIG_ALTIVEC */
 
 .Ldo_err2:
-	ld	r22,STK_REG(r22)(r1)
-	ld	r21,STK_REG(r21)(r1)
-	ld	r20,STK_REG(r20)(r1)
-	ld	r19,STK_REG(r19)(r1)
-	ld	r18,STK_REG(r18)(r1)
-	ld	r17,STK_REG(r17)(r1)
-	ld	r16,STK_REG(r16)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r14,STK_REG(r14)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r14,STK_REG(R14)(r1)
 .Lexit:
 	addi	r1,r1,STACKFRAMESIZE
 .Ldo_err1:
@@ -137,15 +134,15 @@ err1; stw r0,0(r3)
 
 	mflr	r0
 	stdu	r1,-STACKFRAMESIZE(r1)
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
-	std	r17,STK_REG(r17)(r1)
-	std	r18,STK_REG(r18)(r1)
-	std	r19,STK_REG(r19)(r1)
-	std	r20,STK_REG(r20)(r1)
-	std	r21,STK_REG(r21)(r1)
-	std	r22,STK_REG(r22)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
 	std	r0,STACKFRAMESIZE+16(r1)
 
 	srdi	r6,r5,7
@@ -192,15 +189,15 @@ err2; std r21,120(r3)
 
 	clrldi	r5,r5,(64-7)
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
-	ld	r17,STK_REG(r17)(r1)
-	ld	r18,STK_REG(r18)(r1)
-	ld	r19,STK_REG(r19)(r1)
-	ld	r20,STK_REG(r20)(r1)
-	ld	r21,STK_REG(r21)(r1)
-	ld	r22,STK_REG(r22)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
 	addi	r1,r1,STACKFRAMESIZE
 
 	/* Up to 127B to go */
@@ -290,7 +287,7 @@ err1; stb r0,0(r3)
 	mflr	r0
 	std	r0,16(r1)
 	stdu	r1,-STACKFRAMESIZE(r1)
-	bl	.enter_vmx_copy
+	bl	.enter_vmx_usercopy
 	cmpwi	r3,0
 	ld	r0,STACKFRAMESIZE+16(r1)
 	ld	r3,STACKFRAMESIZE+48(r1)
@@ -298,6 +295,68 @@ err1; stb r0,0(r3)
 	ld	r5,STACKFRAMESIZE+64(r1)
 	mtlr	r0
 
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	r7,0x3FF
+	ble	1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	cr1,r7,0x3FF
+	ble	cr1,1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
 	beq	.Lunwind_stack_nonvmx_copy
 
 	/*
@@ -378,9 +437,9 @@ err3; stvx vr0,r3,r11
 7:	sub	r5,r5,r6
 	srdi	r6,r5,7
 
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 	li	r12,64
 	li	r14,80
@@ -415,9 +474,9 @@ err4; stvx vr0,r3,r16
 	addi	r3,r3,128
 	bdnz	8b
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 
 	/* Up to 127B to go */
 	clrldi	r5,r5,(64-7)
@@ -476,7 +535,7 @@ err3; lbz r0,0(r4)
 err3;	stb	r0,0(r3)
 
 15:	addi	r1,r1,STACKFRAMESIZE
-	b	.exit_vmx_copy		/* tail call optimise */
+	b	.exit_vmx_usercopy	/* tail call optimise */
 
 .Lvmx_unaligned_copy:
 	/* Get the destination 16B aligned */
@@ -563,9 +622,9 @@ err3; stvx vr11,r3,r11
 7:	sub	r5,r5,r6
 	srdi	r6,r5,7
 
-	std	r14,STK_REG(r14)(r1)
-	std	r15,STK_REG(r15)(r1)
-	std	r16,STK_REG(r16)(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
 
 	li	r12,64
 	li	r14,80
@@ -608,9 +667,9 @@ err4; stvx vr15,r3,r16
 	addi	r3,r3,128
 	bdnz	8b
 
-	ld	r14,STK_REG(r14)(r1)
-	ld	r15,STK_REG(r15)(r1)
-	ld	r16,STK_REG(r16)(r1)
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
 
 	/* Up to 127B to go */
 	clrldi	r5,r5,(64-7)
@@ -679,5 +738,5 @@ err3; lbz r0,0(r4)
 err3;	stb	r0,0(r3)
 
 15:	addi	r1,r1,STACKFRAMESIZE
-	b	.exit_vmx_copy		/* tail call optimise */
+	b	.exit_vmx_usercopy	/* tail call optimise */
 #endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/crtsavres.S b/arch/powerpc/lib/crtsavres.S
index 1c893f05d224..b2c68ce139ae 100644
--- a/arch/powerpc/lib/crtsavres.S
+++ b/arch/powerpc/lib/crtsavres.S
@@ -41,12 +41,13 @@
 #include <asm/ppc_asm.h>
 
 	.file	"crtsavres.S"
-	.section ".text"
 
 #ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 
 #ifndef CONFIG_PPC64
 
+	.section ".text"
+
 /* Routines for saving integer registers, called by the compiler.  */
 /* Called with r11 pointing to the stack header word of the caller of the */
 /* function, just beyond the end of the integer save area.  */
@@ -232,6 +233,8 @@ _GLOBAL(_rest32gpr_31_x)
 
 #else /* CONFIG_PPC64 */
 
+	.section ".text.save.restore","ax",@progbits
+
 .globl	_savegpr0_14
 _savegpr0_14:
 	std	r14,-144(r1)
diff --git a/arch/powerpc/lib/hweight_64.S b/arch/powerpc/lib/hweight_64.S
index fda27868cf8c..9b96ff2ecd4d 100644
--- a/arch/powerpc/lib/hweight_64.S
+++ b/arch/powerpc/lib/hweight_64.S
@@ -28,7 +28,7 @@ BEGIN_FTR_SECTION
 	nop
 	nop
 FTR_SECTION_ELSE
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
 ALT_FTR_SECTION_END_IFCLR(CPU_FTR_POPCNTB)
@@ -42,14 +42,14 @@ BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
   BEGIN_FTR_SECTION_NESTED(50)
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	srdi	r4,r3,8
 	add	r3,r4,r3
 	clrldi	r3,r3,64-8
 	blr
   FTR_SECTION_ELSE_NESTED(50)
 	clrlwi	r3,r3,16
-	PPC_POPCNTW(r3,r3)
+	PPC_POPCNTW(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
   ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 50)
@@ -66,7 +66,7 @@ BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
   BEGIN_FTR_SECTION_NESTED(51)
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	srdi	r4,r3,16
 	add	r3,r4,r3
 	srdi	r4,r3,8
@@ -74,7 +74,7 @@ FTR_SECTION_ELSE
 	clrldi	r3,r3,64-8
 	blr
   FTR_SECTION_ELSE_NESTED(51)
-	PPC_POPCNTW(r3,r3)
+	PPC_POPCNTW(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
   ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 51)
@@ -93,7 +93,7 @@ BEGIN_FTR_SECTION
 	nop
 FTR_SECTION_ELSE
   BEGIN_FTR_SECTION_NESTED(52)
-	PPC_POPCNTB(r3,r3)
+	PPC_POPCNTB(R3,R3)
 	srdi	r4,r3,32
 	add	r3,r4,r3
 	srdi	r4,r3,16
@@ -103,7 +103,7 @@ FTR_SECTION_ELSE
 	clrldi	r3,r3,64-8
 	blr
   FTR_SECTION_ELSE_NESTED(52)
-	PPC_POPCNTD(r3,r3)
+	PPC_POPCNTD(R3,R3)
 	clrldi	r3,r3,64-8
 	blr
 ALT_FTR_SECTION_END_NESTED_IFCLR(CPU_FTR_POPCNTD, 52)
diff --git a/arch/powerpc/lib/ldstfp.S b/arch/powerpc/lib/ldstfp.S
index 6a85380520b6..85aec08ab234 100644
--- a/arch/powerpc/lib/ldstfp.S
+++ b/arch/powerpc/lib/ldstfp.S
@@ -330,13 +330,13 @@ _GLOBAL(do_lxvd2x)
 	MTMSRD(r7)
 	isync
 	beq	cr7,1f
-	STXVD2X(0,r1,r8)
+	STXVD2X(0,R1,R8)
 1:	li	r9,-EFAULT
-2:	LXVD2X(0,0,r4)
+2:	LXVD2X(0,R0,R4)
 	li	r9,0
 3:	beq	cr7,4f
 	bl	put_vsr
-	LXVD2X(0,r1,r8)
+	LXVD2X(0,R1,R8)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
 	MTMSRD(r6)
@@ -358,13 +358,13 @@ _GLOBAL(do_stxvd2x)
 	MTMSRD(r7)
 	isync
 	beq	cr7,1f
-	STXVD2X(0,r1,r8)
+	STXVD2X(0,R1,R8)
 	bl	get_vsr
 1:	li	r9,-EFAULT
-2:	STXVD2X(0,0,r4)
+2:	STXVD2X(0,R0,R4)
 	li	r9,0
 3:	beq	cr7,4f
-	LXVD2X(0,r1,r8)
+	LXVD2X(0,R1,R8)
 4:	PPC_LL	r0,STKFRM+PPC_LR_STKOFF(r1)
 	mtlr	r0
 	MTMSRD(r6)
diff --git a/arch/powerpc/lib/memcpy_64.S b/arch/powerpc/lib/memcpy_64.S
index 82fea3963e15..d2bbbc8d7dc0 100644
--- a/arch/powerpc/lib/memcpy_64.S
+++ b/arch/powerpc/lib/memcpy_64.S
@@ -11,7 +11,11 @@
 
 	.align	7
 _GLOBAL(memcpy)
+BEGIN_FTR_SECTION
 	std	r3,48(r1)	/* save destination pointer for return value */
+FTR_SECTION_ELSE
+	b	memcpy_power7
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY)
 	PPC_MTOCRF(0x01,r5)
 	cmpldi	cr1,r5,16
 	neg	r6,r3		# LS 3 bits = # bytes to 8-byte dest bdry
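
The feature section above is resolved at boot by the feature-fixup machinery, so no test runs at call time; conceptually it is equivalent to this sketch (memcpy_generic is a made-up name for the pre-existing fallback path that follows):

	/* Illustrative C shape of the boot-time patched dispatch above. */
	void *memcpy_dispatch(void *to, const void *from, unsigned long n)
	{
		if (cpu_has_feature(CPU_FTR_VMX_COPY))
			return memcpy_power7(to, from, n);
		return memcpy_generic(to, from, n);	/* hypothetical name */
	}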
diff --git a/arch/powerpc/lib/memcpy_power7.S b/arch/powerpc/lib/memcpy_power7.S
new file mode 100644
index 000000000000..0efdc51bc716
--- /dev/null
+++ b/arch/powerpc/lib/memcpy_power7.S
@@ -0,0 +1,647 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+#include <asm/ppc_asm.h>
+
+_GLOBAL(memcpy_power7)
+#ifdef CONFIG_ALTIVEC
+	cmpldi	r5,16
+	cmpldi	cr1,r5,4096
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+	bgt	cr1,.Lvmx_copy
+#else
+	cmpldi	r5,16
+
+	std	r3,48(r1)
+
+	blt	.Lshort_copy
+#endif
+
+.Lnonvmx_copy:
+	/* Get the source 8B aligned */
+	neg	r6,r4
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r5,r5,r6
+	cmpldi	r5,128
+	blt	5f
+
+	mflr	r0
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+	std	r17,STK_REG(R17)(r1)
+	std	r18,STK_REG(R18)(r1)
+	std	r19,STK_REG(R19)(r1)
+	std	r20,STK_REG(R20)(r1)
+	std	r21,STK_REG(R21)(r1)
+	std	r22,STK_REG(R22)(r1)
+	std	r0,STACKFRAMESIZE+16(r1)
+
+	srdi	r6,r5,7
+	mtctr	r6
+
+	/* Now do cacheline (128B) sized loads and stores. */
+	.align	5
+4:
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	ld	r14,64(r4)
+	ld	r15,72(r4)
+	ld	r16,80(r4)
+	ld	r17,88(r4)
+	ld	r18,96(r4)
+	ld	r19,104(r4)
+	ld	r20,112(r4)
+	ld	r21,120(r4)
+	addi	r4,r4,128
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	std	r14,64(r3)
+	std	r15,72(r3)
+	std	r16,80(r3)
+	std	r17,88(r3)
+	std	r18,96(r3)
+	std	r19,104(r3)
+	std	r20,112(r3)
+	std	r21,120(r3)
+	addi	r3,r3,128
+	bdnz	4b
+
+	clrldi	r5,r5,(64-7)
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+	ld	r17,STK_REG(R17)(r1)
+	ld	r18,STK_REG(R18)(r1)
+	ld	r19,STK_REG(R19)(r1)
+	ld	r20,STK_REG(R20)(r1)
+	ld	r21,STK_REG(R21)(r1)
+	ld	r22,STK_REG(R22)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	/* Up to 127B to go */
+5:	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+6:	bf	cr7*4+1,7f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	ld	r9,32(r4)
+	ld	r10,40(r4)
+	ld	r11,48(r4)
+	ld	r12,56(r4)
+	addi	r4,r4,64
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	std	r9,32(r3)
+	std	r10,40(r3)
+	std	r11,48(r3)
+	std	r12,56(r3)
+	addi	r3,r3,64
+
+	/* Up to 63B to go */
+7:	bf	cr7*4+2,8f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	ld	r7,16(r4)
+	ld	r8,24(r4)
+	addi	r4,r4,32
+	std	r0,0(r3)
+	std	r6,8(r3)
+	std	r7,16(r3)
+	std	r8,24(r3)
+	addi	r3,r3,32
+
+	/* Up to 31B to go */
+8:	bf	cr7*4+3,9f
+	ld	r0,0(r4)
+	ld	r6,8(r4)
+	addi	r4,r4,16
+	std	r0,0(r3)
+	std	r6,8(r3)
+	addi	r3,r3,16
+
+9:	clrldi	r5,r5,(64-4)
+
+	/* Up to 15B to go */
+.Lshort_copy:
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	ld	r3,48(r1)
+	blr
+
+.Lunwind_stack_nonvmx_copy:
+	addi	r1,r1,STACKFRAMESIZE
+	b	.Lnonvmx_copy
+
+#ifdef CONFIG_ALTIVEC
+.Lvmx_copy:
+	mflr	r0
+	std	r4,56(r1)
+	std	r5,64(r1)
+	std	r0,16(r1)
+	stdu	r1,-STACKFRAMESIZE(r1)
+	bl	.enter_vmx_copy
+	cmpwi	r3,0
+	ld	r0,STACKFRAMESIZE+16(r1)
+	ld	r3,STACKFRAMESIZE+48(r1)
+	ld	r4,STACKFRAMESIZE+56(r1)
+	ld	r5,STACKFRAMESIZE+64(r1)
+	mtlr	r0
+
+	/*
+	 * We prefetch both the source and destination using enhanced touch
+	 * instructions. We use a stream ID of 0 for the load side and
+	 * 1 for the store side.
+	 */
+	clrrdi	r6,r4,7
+	clrrdi	r9,r3,7
+	ori	r9,r9,1		/* stream=1 */
+
+	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
+	cmpldi	cr1,r7,0x3FF
+	ble	cr1,1f
+	li	r7,0x3FF
+1:	lis	r0,0x0E00	/* depth=7 */
+	sldi	r7,r7,7
+	or	r7,r7,r0
+	ori	r10,r7,1	/* stream=1 */
+
+	lis	r8,0x8000	/* GO=1 */
+	clrldi	r8,r8,32
+
+.machine push
+.machine "power4"
+	dcbt	r0,r6,0b01000
+	dcbt	r0,r7,0b01010
+	dcbtst	r0,r9,0b01000
+	dcbtst	r0,r10,0b01010
+	eieio
+	dcbt	r0,r8,0b01010	/* GO */
+.machine pop
+
+	beq	.Lunwind_stack_nonvmx_copy
+
+	/*
+	 * If source and destination are not relatively aligned we use a
+	 * slower permute loop.
+	 */
+	xor	r6,r4,r3
+	rldicl.	r6,r6,0,(64-4)
+	bne	.Lvmx_unaligned_copy
+
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the destination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	lvx	vr6,r4,r9
+	lvx	vr5,r4,r10
+	lvx	vr4,r4,r11
+	lvx	vr3,r4,r12
+	lvx	vr2,r4,r14
+	lvx	vr1,r4,r15
+	lvx	vr0,r4,r16
+	addi	r4,r4,128
+	stvx	vr7,r0,r3
+	stvx	vr6,r3,r9
+	stvx	vr5,r3,r10
+	stvx	vr4,r3,r11
+	stvx	vr3,r3,r12
+	stvx	vr2,r3,r14
+	stvx	vr1,r3,r15
+	stvx	vr0,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	lvx	vr2,r4,r9
+	lvx	vr1,r4,r10
+	lvx	vr0,r4,r11
+	addi	r4,r4,64
+	stvx	vr3,r0,r3
+	stvx	vr2,r3,r9
+	stvx	vr1,r3,r10
+	stvx	vr0,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	lvx	vr0,r4,r9
+	addi	r4,r4,32
+	stvx	vr1,r0,r3
+	stvx	vr0,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	addi	r4,r4,16
+	stvx	vr1,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	ld	r0,0(r4)
+	addi	r4,r4,8
+	std	r0,0(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	b	.exit_vmx_copy		/* tail call optimise */
+
+.Lvmx_unaligned_copy:
+	/* Get the destination 16B aligned */
+	neg	r6,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-4)
+
+	bf	cr7*4+3,1f
+	lbz	r0,0(r4)
+	addi	r4,r4,1
+	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	bf	cr7*4+0,4f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r7,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r7,4(r3)
+	addi	r3,r3,8
+
+4:	sub	r5,r5,r6
+
+	/* Get the destination 128B aligned */
+	neg	r6,r3
+	srdi	r7,r6,4
+	mtocrf	0x01,r7
+	clrldi	r6,r6,(64-7)
+
+	li	r9,16
+	li	r10,32
+	li	r11,48
+
+	lvsl	vr16,0,r4	/* Setup permute control vector */
+	lvx	vr0,0,r4
+	addi	r4,r4,16
+
+	bf	cr7*4+3,5f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+	vor	vr0,vr1,vr1
+
+5:	bf	cr7*4+2,6f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+6:	bf	cr7*4+1,7f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+7:	sub	r5,r5,r6
+	srdi	r6,r5,7
+
+	std	r14,STK_REG(R14)(r1)
+	std	r15,STK_REG(R15)(r1)
+	std	r16,STK_REG(R16)(r1)
+
+	li	r12,64
+	li	r14,80
+	li	r15,96
+	li	r16,112
+
+	mtctr	r6
+
+	/*
+	 * Now do cacheline sized loads and stores. By this stage the
+	 * cacheline stores are also cacheline aligned.
+	 */
+	.align	5
+8:
+	lvx	vr7,r0,r4
+	vperm	vr8,vr0,vr7,vr16
+	lvx	vr6,r4,r9
+	vperm	vr9,vr7,vr6,vr16
+	lvx	vr5,r4,r10
+	vperm	vr10,vr6,vr5,vr16
+	lvx	vr4,r4,r11
+	vperm	vr11,vr5,vr4,vr16
+	lvx	vr3,r4,r12
+	vperm	vr12,vr4,vr3,vr16
+	lvx	vr2,r4,r14
+	vperm	vr13,vr3,vr2,vr16
+	lvx	vr1,r4,r15
+	vperm	vr14,vr2,vr1,vr16
+	lvx	vr0,r4,r16
+	vperm	vr15,vr1,vr0,vr16
+	addi	r4,r4,128
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	stvx	vr12,r3,r12
+	stvx	vr13,r3,r14
+	stvx	vr14,r3,r15
+	stvx	vr15,r3,r16
+	addi	r3,r3,128
+	bdnz	8b
+
+	ld	r14,STK_REG(R14)(r1)
+	ld	r15,STK_REG(R15)(r1)
+	ld	r16,STK_REG(R16)(r1)
+
+	/* Up to 127B to go */
+	clrldi	r5,r5,(64-7)
+	srdi	r6,r5,4
+	mtocrf	0x01,r6
+
+	bf	cr7*4+1,9f
+	lvx	vr3,r0,r4
+	vperm	vr8,vr0,vr3,vr16
+	lvx	vr2,r4,r9
+	vperm	vr9,vr3,vr2,vr16
+	lvx	vr1,r4,r10
+	vperm	vr10,vr2,vr1,vr16
+	lvx	vr0,r4,r11
+	vperm	vr11,vr1,vr0,vr16
+	addi	r4,r4,64
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	stvx	vr10,r3,r10
+	stvx	vr11,r3,r11
+	addi	r3,r3,64
+
+9:	bf	cr7*4+2,10f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	lvx	vr0,r4,r9
+	vperm	vr9,vr1,vr0,vr16
+	addi	r4,r4,32
+	stvx	vr8,r0,r3
+	stvx	vr9,r3,r9
+	addi	r3,r3,32
+
+10:	bf	cr7*4+3,11f
+	lvx	vr1,r0,r4
+	vperm	vr8,vr0,vr1,vr16
+	addi	r4,r4,16
+	stvx	vr8,r0,r3
+	addi	r3,r3,16
+
+	/* Up to 15B to go */
+11:	clrldi	r5,r5,(64-4)
+	addi	r4,r4,-16	/* Unwind the +16 load offset */
+	mtocrf	0x01,r5
+	bf	cr7*4+0,12f
+	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
+	lwz	r6,4(r4)
+	addi	r4,r4,8
+	stw	r0,0(r3)
+	stw	r6,4(r3)
+	addi	r3,r3,8
+
+12:	bf	cr7*4+1,13f
+	lwz	r0,0(r4)
+	addi	r4,r4,4
+	stw	r0,0(r3)
+	addi	r3,r3,4
+
+13:	bf	cr7*4+2,14f
+	lhz	r0,0(r4)
+	addi	r4,r4,2
+	sth	r0,0(r3)
+	addi	r3,r3,2
+
+14:	bf	cr7*4+3,15f
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+
+15:	addi	r1,r1,STACKFRAMESIZE
+	ld	r3,48(r1)
+	b	.exit_vmx_copy		/* tail call optimise */
+#endif /* CONFIG_ALTIVEC */
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S
index 093d6316435c..1b5a0a09d609 100644
--- a/arch/powerpc/lib/string.S
+++ b/arch/powerpc/lib/string.S
@@ -119,6 +119,7 @@ _GLOBAL(memchr)
 2:	li	r3,0
 	blr
 
+#ifdef CONFIG_PPC32
 _GLOBAL(__clear_user)
 	addi	r6,r3,-4
 	li	r3,0
@@ -160,3 +161,4 @@ _GLOBAL(__clear_user)
 	PPC_LONG	1b,91b
 	PPC_LONG	8b,92b
 	.text
+#endif
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S
new file mode 100644
index 000000000000..3b1e48049faf
--- /dev/null
+++ b/arch/powerpc/lib/string_64.S
@@ -0,0 +1,202 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton@au.ibm.com>
+ */
+
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+
+	.section	".toc","aw"
+PPC64_CACHES:
+	.tc		ppc64_caches[TC],ppc64_caches
+	.section	".text"
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to:   Destination address, in user space.
+ * @n:    Number of bytes to zero.
+ *
+ * Zero a block of memory in user space.  Caller must check
+ * the specified block with access_ok() before calling this function.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+
+	.macro err1
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Ldo_err1
+	.previous
+	.endm
+
+	.macro err2
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldo_err2
+	.previous
+	.endm
+
+	.macro err3
+300:
+	.section __ex_table,"a"
+	.align 3
+	.llong 300b,.Ldo_err3
+	.previous
+	.endm
+
+.Ldo_err1:
+	mr	r3,r8
+
+.Ldo_err2:
+	mtctr	r4
+1:
+err3;	stb	r0,0(r3)
+	addi	r3,r3,1
+	addi	r4,r4,-1
+	bdnz	1b
+
+.Ldo_err3:
+	mr	r3,r4
+	blr
+
+_GLOBAL(__clear_user)
+	cmpdi	r4,32
+	neg	r6,r3
+	li	r0,0
+	blt	.Lshort_clear
+	mr	r8,r3
+	mtocrf	0x01,r6
+	clrldi	r6,r6,(64-3)
+
+	/* Get the destination 8 byte aligned */
+	bf	cr7*4+3,1f
+err1;	stb	r0,0(r3)
+	addi	r3,r3,1
+
+1:	bf	cr7*4+2,2f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+2:	bf	cr7*4+1,3f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+3:	sub	r4,r4,r6
+
+	cmpdi	r4,32
+	cmpdi	cr1,r4,512
+	blt	.Lshort_clear
+	bgt	cr1,.Llong_clear
+
+.Lmedium_clear:
+	srdi	r6,r4,5
+	mtctr	r6
+
+	/* Do 32 byte chunks */
+4:
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+err2;	std	r0,16(r3)
+err2;	std	r0,24(r3)
+	addi	r3,r3,32
+	addi	r4,r4,-32
+	bdnz	4b
+
+.Lshort_clear:
+	/* up to 31 bytes to go */
+	cmpdi	r4,16
+	blt	6f
+err2;	std	r0,0(r3)
+err2;	std	r0,8(r3)
+	addi	r3,r3,16
+	addi	r4,r4,-16
+
+	/* Up to 15 bytes to go */
+6:	mr	r8,r3
+	clrldi	r4,r4,(64-4)
+	mtocrf	0x01,r4
+	bf	cr7*4+0,7f
+err1;	std	r0,0(r3)
+	addi	r3,r3,8
+
+7:	bf	cr7*4+1,8f
+err1;	stw	r0,0(r3)
+	addi	r3,r3,4
+
+8:	bf	cr7*4+2,9f
+err1;	sth	r0,0(r3)
+	addi	r3,r3,2
+
+9:	bf	cr7*4+3,10f
+err1;	stb	r0,0(r3)
+
+10:	li	r3,0
+	blr
+
+.Llong_clear:
+	ld	r5,PPC64_CACHES@toc(r2)
+
+	bf	cr7*4+0,11f
+err2;	std	r0,0(r3)
+	addi	r3,r3,8
+	addi	r4,r4,-8
+
+	/* Destination is 16 byte aligned, need to get it cacheline aligned */
+11:	lwz	r7,DCACHEL1LOGLINESIZE(r5)
+	lwz	r9,DCACHEL1LINESIZE(r5)
+
+	/*
+	 * With worst case alignment the long clear loop takes a minimum
+	 * of 1 byte less than 2 cachelines.
+	 */
+	sldi	r10,r9,2
+	cmpd	r4,r10
+	blt	.Lmedium_clear
+
+	neg	r6,r3
+	addi	r10,r9,-1
+	and.	r5,r6,r10
+	beq	13f
+
+	srdi	r6,r5,4
+	mtctr	r6
+	mr	r8,r3
+12:
+err1;	std	r0,0(r3)
+err1;	std	r0,8(r3)
+	addi	r3,r3,16
+	bdnz	12b
+
+	sub	r4,r4,r5
+
+13:	srd	r6,r4,r7
+	mtctr	r6
+	mr	r8,r3
+14:
+err1;	dcbz	r0,r3
+	add	r3,r3,r9
+	bdnz	14b
+
+	and	r4,r4,r10
+
+	cmpdi	r4,32
+	blt	.Lshort_clear
+	b	.Lmedium_clear
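
The kernel-doc comment above states the contract; the usual calling pattern looks like this sketch (written against the 2012-era access_ok() signature, for illustration only):

	/* Sketch: zero a user buffer, honouring the documented contract. */
	static int zero_user_buffer(void __user *buf, unsigned long len)
	{
		if (!access_ok(VERIFY_WRITE, buf, len))
			return -EFAULT;

		/* __clear_user() returns the number of bytes NOT cleared */
		return __clear_user(buf, len) ? -EFAULT : 0;
	}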
diff --git a/arch/powerpc/lib/copyuser_power7_vmx.c b/arch/powerpc/lib/vmx-helper.c
index bf2654f2b68e..3cf529ceec5b 100644
--- a/arch/powerpc/lib/copyuser_power7_vmx.c
+++ b/arch/powerpc/lib/vmx-helper.c
@@ -22,7 +22,7 @@
 #include <linux/hardirq.h>
 #include <asm/switch_to.h>
 
-int enter_vmx_copy(void)
+int enter_vmx_usercopy(void)
 {
 	if (in_interrupt())
 		return 0;
@@ -44,8 +44,31 @@ int enter_vmx_copy(void)
  * This function must return 0 because we tail call optimise when calling
  * from __copy_tofrom_user_power7 which returns 0 on success.
  */
-int exit_vmx_copy(void)
+int exit_vmx_usercopy(void)
 {
 	pagefault_enable();
 	return 0;
 }
+
+int enter_vmx_copy(void)
+{
+	if (in_interrupt())
+		return 0;
+
+	preempt_disable();
+
+	enable_kernel_altivec();
+
+	return 1;
+}
+
+/*
+ * All calls to this function will be optimised into tail calls. We are
+ * passed a pointer to the destination which we return as required by a
+ * memcpy implementation.
+ */
+void *exit_vmx_copy(void *dest)
+{
+	preempt_enable();
+	return dest;
+}
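
The new enter_vmx_copy()/exit_vmx_copy() pair brackets the Altivec section of memcpy_power7 the same way the usercopy variants bracket __copy_tofrom_user_power7; in C shape the usage is roughly as follows (the two copy helpers named below are made-up stand-ins for the assembly loops):

	/* Illustrative only: how memcpy_power7.S uses the helpers above. */
	void *memcpy_vmx_sketch(void *dest, const void *src, unsigned long n)
	{
		if (!enter_vmx_copy())
			return scalar_copy(dest, src, n);	/* hypothetical fallback */

		vmx_loop_copy(dest, src, n);			/* hypothetical VMX loop */

		return exit_vmx_copy(dest);	/* tail call: returns dest */
	}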