aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2012-09-27 00:11:01 -0400
committerDavid S. Miller <davem@davemloft.net>2012-09-27 03:35:11 -0400
commitae2c6ca64118b934ef85f66adb03d5bbfdd57201 (patch)
tree5eb9a50cce32cadd527d5fc92095c76c00b72bae
parentda201161662b8ee9c8d7bd8cc50ce3cb3366d400 (diff)
sparc64: Add SPARC-T4 optimized memcpy.
Before After -------------- -------------- bw_tcp: 1288.53 MB/sec 1637.77 MB/sec bw_pipe: 1517.18 MB/sec 2107.61 MB/sec bw_unix: 1838.38 MB/sec 2640.91 MB/sec make -s -j128 allmodconfig 5min 49sec 5min 31sec Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--arch/sparc/kernel/head_64.S14
-rw-r--r--arch/sparc/lib/Makefile3
-rw-r--r--arch/sparc/lib/NG4copy_from_user.S30
-rw-r--r--arch/sparc/lib/NG4copy_page.S57
-rw-r--r--arch/sparc/lib/NG4copy_to_user.S39
-rw-r--r--arch/sparc/lib/NG4memcpy.S360
-rw-r--r--arch/sparc/lib/NG4patch.S43
-rw-r--r--arch/sparc/lib/NGpage.S2
8 files changed, 546 insertions, 2 deletions
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index b42ddbf9651e..ee5dcced2499 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -559,10 +559,10 @@ niagara_tlb_fixup:
559 be,pt %xcc, niagara2_patch 559 be,pt %xcc, niagara2_patch
560 nop 560 nop
561 cmp %g1, SUN4V_CHIP_NIAGARA4 561 cmp %g1, SUN4V_CHIP_NIAGARA4
562 be,pt %xcc, niagara2_patch 562 be,pt %xcc, niagara4_patch
563 nop 563 nop
564 cmp %g1, SUN4V_CHIP_NIAGARA5 564 cmp %g1, SUN4V_CHIP_NIAGARA5
565 be,pt %xcc, niagara2_patch 565 be,pt %xcc, niagara4_patch
566 nop 566 nop
567 567
568 call generic_patch_copyops 568 call generic_patch_copyops
@@ -573,6 +573,16 @@ niagara_tlb_fixup:
573 nop 573 nop
574 574
575 ba,a,pt %xcc, 80f 575 ba,a,pt %xcc, 80f
576niagara4_patch:
577 call niagara4_patch_copyops
578 nop
579 call niagara_patch_bzero
580 nop
581 call niagara4_patch_pageops
582 nop
583
584 ba,a,pt %xcc, 80f
585
576niagara2_patch: 586niagara2_patch:
577 call niagara2_patch_copyops 587 call niagara2_patch_copyops
578 nop 588 nop
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile
index dff4096f3dec..30f6ab51c551 100644
--- a/arch/sparc/lib/Makefile
+++ b/arch/sparc/lib/Makefile
@@ -32,6 +32,9 @@ lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o
32lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o 32lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o
33lib-$(CONFIG_SPARC64) += NG2patch.o 33lib-$(CONFIG_SPARC64) += NG2patch.o
34 34
35lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o
36lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o
37
35lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o 38lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o
36lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o 39lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o
37 40
diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S
new file mode 100644
index 000000000000..c8e98307ef59
--- /dev/null
+++ b/arch/sparc/lib/NG4copy_from_user.S
@@ -0,0 +1,30 @@
1/* NG4copy_from_user.S: Niagara-2 optimized copy from userspace.
2 *
3 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4 */
5
6#define EX_LD(x) \
798: x; \
8 .section __ex_table,"a";\
9 .align 4; \
10 .word 98b, __retl_one_asi;\
11 .text; \
12 .align 4;
13
14#ifndef ASI_AIUS
15#define ASI_AIUS 0x11
16#endif
17
18#define FUNC_NAME NG4copy_from_user
19#define LOAD(type,addr,dest) type##a [addr] %asi, dest
20#define EX_RETVAL(x) 0
21
22#ifdef __KERNEL__
23#define PREAMBLE \
24 rd %asi, %g1; \
25 cmp %g1, ASI_AIUS; \
26 bne,pn %icc, ___copy_in_user; \
27 nop
28#endif
29
30#include "NG4memcpy.S"
diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S
new file mode 100644
index 000000000000..f30ec10bbcac
--- /dev/null
+++ b/arch/sparc/lib/NG4copy_page.S
@@ -0,0 +1,57 @@
1/* NG4copy_page.S: Niagara-4 optimized copy page.
2 *
3 * Copyright (C) 2012 (davem@davemloft.net)
4 */
5
6#include <asm/asi.h>
7#include <asm/page.h>
8
9 .text
10 .align 32
11
12 .register %g2, #scratch
13 .register %g3, #scratch
14
15 .globl NG4copy_user_page
16NG4copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */
17 prefetch [%o1 + 0x000], #n_reads_strong
18 prefetch [%o1 + 0x040], #n_reads_strong
19 prefetch [%o1 + 0x080], #n_reads_strong
20 prefetch [%o1 + 0x0c0], #n_reads_strong
21 set PAGE_SIZE, %g7
22 prefetch [%o1 + 0x100], #n_reads_strong
23 prefetch [%o1 + 0x140], #n_reads_strong
24 prefetch [%o1 + 0x180], #n_reads_strong
25 prefetch [%o1 + 0x1c0], #n_reads_strong
261:
27 ldx [%o1 + 0x00], %o2
28 subcc %g7, 0x40, %g7
29 ldx [%o1 + 0x08], %o3
30 ldx [%o1 + 0x10], %o4
31 ldx [%o1 + 0x18], %o5
32 ldx [%o1 + 0x20], %g1
33 stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
34 add %o0, 0x08, %o0
35 ldx [%o1 + 0x28], %g2
36 stxa %o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
37 add %o0, 0x08, %o0
38 ldx [%o1 + 0x30], %g3
39 stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P
40 add %o0, 0x08, %o0
41 ldx [%o1 + 0x38], %o2
42 add %o1, 0x40, %o1
43 stxa %o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P
44 add %o0, 0x08, %o0
45 stxa %g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P
46 add %o0, 0x08, %o0
47 stxa %g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
48 add %o0, 0x08, %o0
49 stxa %g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P
50 add %o0, 0x08, %o0
51 stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P
52 add %o0, 0x08, %o0
53 bne,pt %icc, 1b
54 prefetch [%o1 + 0x200], #n_reads_strong
55 retl
56 membar #StoreLoad | #StoreStore
57 .size NG4copy_user_page,.-NG4copy_user_page
diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S
new file mode 100644
index 000000000000..9744c4540a8d
--- /dev/null
+++ b/arch/sparc/lib/NG4copy_to_user.S
@@ -0,0 +1,39 @@
1/* NG4copy_to_user.S: Niagara-4 optimized copy to userspace.
2 *
3 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4 */
5
6#define EX_ST(x) \
798: x; \
8 .section __ex_table,"a";\
9 .align 4; \
10 .word 98b, __retl_one_asi;\
11 .text; \
12 .align 4;
13
14#ifndef ASI_AIUS
15#define ASI_AIUS 0x11
16#endif
17
18#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
19#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
20#endif
21
22#define FUNC_NAME NG4copy_to_user
23#define STORE(type,src,addr) type##a src, [addr] %asi
24#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
25#define EX_RETVAL(x) 0
26
27#ifdef __KERNEL__
28 /* Writing to %asi is _expensive_ so we hardcode it.
29 * Reading %asi to check for KERNEL_DS is comparatively
30 * cheap.
31 */
32#define PREAMBLE \
33 rd %asi, %g1; \
34 cmp %g1, ASI_AIUS; \
35 bne,pn %icc, ___copy_in_user; \
36 nop
37#endif
38
39#include "NG4memcpy.S"
diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S
new file mode 100644
index 000000000000..6f4b526b3c4f
--- /dev/null
+++ b/arch/sparc/lib/NG4memcpy.S
@@ -0,0 +1,360 @@
1/* NG4memcpy.S: Niagara-4 optimized memcpy.
2 *
3 * Copyright (C) 2012 David S. Miller (davem@davemloft.net)
4 */
5
6#ifdef __KERNEL__
7#include <asm/visasm.h>
8#include <asm/asi.h>
9#define GLOBAL_SPARE %g7
10#else
11#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
12#define FPRS_FEF 0x04
13
14/* On T4 it is very expensive to access ASRs like %fprs and
15 * %asi, avoiding a read or a write can save ~50 cycles.
16 */
17#define FPU_ENTER \
18 rd %fprs, %o5; \
19 andcc %o5, FPRS_FEF, %g0; \
20 be,a,pn %icc, 999f; \
21 wr %g0, FPRS_FEF, %fprs; \
22 999:
23
24#ifdef MEMCPY_DEBUG
25#define VISEntryHalf FPU_ENTER; \
26 clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0;
27#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
28#else
29#define VISEntryHalf FPU_ENTER
30#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
31#endif
32
33#define GLOBAL_SPARE %g5
34#endif
35
36#ifndef STORE_ASI
37#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
38#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
39#else
40#define STORE_ASI 0x80 /* ASI_P */
41#endif
42#endif
43
44#ifndef EX_LD
45#define EX_LD(x) x
46#endif
47
48#ifndef EX_ST
49#define EX_ST(x) x
50#endif
51
52#ifndef EX_RETVAL
53#define EX_RETVAL(x) x
54#endif
55
56#ifndef LOAD
57#define LOAD(type,addr,dest) type [addr], dest
58#endif
59
60#ifndef STORE
61#ifndef MEMCPY_DEBUG
62#define STORE(type,src,addr) type src, [addr]
63#else
64#define STORE(type,src,addr) type##a src, [addr] %asi
65#endif
66#endif
67
68#ifndef STORE_INIT
69#define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI
70#endif
71
72#ifndef FUNC_NAME
73#define FUNC_NAME NG4memcpy
74#endif
75#ifndef PREAMBLE
76#define PREAMBLE
77#endif
78
79#ifndef XCC
80#define XCC xcc
81#endif
82
83 .register %g2,#scratch
84 .register %g3,#scratch
85
86 .text
87 .align 64
88
89 .globl FUNC_NAME
90 .type FUNC_NAME,#function
91FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
92#ifdef MEMCPY_DEBUG
93 wr %g0, 0x80, %asi
94#endif
95 srlx %o2, 31, %g2
96 cmp %g2, 0
97 tne %XCC, 5
98 PREAMBLE
99 mov %o0, %o3
100 brz,pn %o2, .Lexit
101 cmp %o2, 3
102 ble,pn %icc, .Ltiny
103 cmp %o2, 19
104 ble,pn %icc, .Lsmall
105 or %o0, %o1, %g2
106 cmp %o2, 128
107 bl,pn %icc, .Lmedium
108 nop
109
110.Llarge:/* len >= 0x80 */
111 /* First get dest 8 byte aligned. */
112 sub %g0, %o0, %g1
113 and %g1, 0x7, %g1
114 brz,pt %g1, 51f
115 sub %o2, %g1, %o2
116
1171: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
118 add %o1, 1, %o1
119 subcc %g1, 1, %g1
120 add %o0, 1, %o0
121 bne,pt %icc, 1b
122 EX_ST(STORE(stb, %g2, %o0 - 0x01))
123
12451: LOAD(prefetch, %o1 + 0x040, #n_reads_strong)
125 LOAD(prefetch, %o1 + 0x080, #n_reads_strong)
126 LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong)
127 LOAD(prefetch, %o1 + 0x100, #n_reads_strong)
128 LOAD(prefetch, %o1 + 0x140, #n_reads_strong)
129 LOAD(prefetch, %o1 + 0x180, #n_reads_strong)
130 LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong)
131 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
132
133 /* Check if we can use the straight fully aligned
134 * loop, or we require the alignaddr/faligndata variant.
135 */
136 andcc %o1, 0x7, %o5
137 bne,pn %icc, .Llarge_src_unaligned
138 sub %g0, %o0, %g1
139
140 /* Legitimize the use of initializing stores by getting dest
141 * to be 64-byte aligned.
142 */
143 and %g1, 0x3f, %g1
144 brz,pt %g1, .Llarge_aligned
145 sub %o2, %g1, %o2
146
1471: EX_LD(LOAD(ldx, %o1 + 0x00, %g2))
148 add %o1, 8, %o1
149 subcc %g1, 8, %g1
150 add %o0, 8, %o0
151 bne,pt %icc, 1b
152 EX_ST(STORE(stx, %g2, %o0 - 0x08))
153
154.Llarge_aligned:
155 /* len >= 0x80 && src 8-byte aligned && dest 8-byte aligned */
156 andn %o2, 0x3f, %o4
157 sub %o2, %o4, %o2
158
1591: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
160 add %o1, 0x40, %o1
161 EX_LD(LOAD(ldx, %o1 - 0x38, %g2))
162 subcc %o4, 0x40, %o4
163 EX_LD(LOAD(ldx, %o1 - 0x30, %g3))
164 EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE))
165 EX_LD(LOAD(ldx, %o1 - 0x20, %o5))
166 EX_ST(STORE_INIT(%g1, %o0))
167 add %o0, 0x08, %o0
168 EX_ST(STORE_INIT(%g2, %o0))
169 add %o0, 0x08, %o0
170 EX_LD(LOAD(ldx, %o1 - 0x18, %g2))
171 EX_ST(STORE_INIT(%g3, %o0))
172 add %o0, 0x08, %o0
173 EX_LD(LOAD(ldx, %o1 - 0x10, %g3))
174 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
175 add %o0, 0x08, %o0
176 EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE))
177 EX_ST(STORE_INIT(%o5, %o0))
178 add %o0, 0x08, %o0
179 EX_ST(STORE_INIT(%g2, %o0))
180 add %o0, 0x08, %o0
181 EX_ST(STORE_INIT(%g3, %o0))
182 add %o0, 0x08, %o0
183 EX_ST(STORE_INIT(GLOBAL_SPARE, %o0))
184 add %o0, 0x08, %o0
185 bne,pt %icc, 1b
186 LOAD(prefetch, %o1 + 0x200, #n_reads_strong)
187
188 membar #StoreLoad | #StoreStore
189
190 brz,pn %o2, .Lexit
191 cmp %o2, 19
192 ble,pn %icc, .Lsmall_unaligned
193 nop
194 ba,a,pt %icc, .Lmedium_noprefetch
195
196.Lexit: retl
197 mov EX_RETVAL(%o3), %o0
198
199.Llarge_src_unaligned:
200 andn %o2, 0x3f, %o4
201 sub %o2, %o4, %o2
202 VISEntryHalf
203 alignaddr %o1, %g0, %g1
204 add %o1, %o4, %o1
205 EX_LD(LOAD(ldd, %g1 + 0x00, %f0))
2061: EX_LD(LOAD(ldd, %g1 + 0x08, %f2))
207 subcc %o4, 0x40, %o4
208 EX_LD(LOAD(ldd, %g1 + 0x10, %f4))
209 EX_LD(LOAD(ldd, %g1 + 0x18, %f6))
210 EX_LD(LOAD(ldd, %g1 + 0x20, %f8))
211 EX_LD(LOAD(ldd, %g1 + 0x28, %f10))
212 EX_LD(LOAD(ldd, %g1 + 0x30, %f12))
213 EX_LD(LOAD(ldd, %g1 + 0x38, %f14))
214 faligndata %f0, %f2, %f16
215 EX_LD(LOAD(ldd, %g1 + 0x40, %f0))
216 faligndata %f2, %f4, %f18
217 add %g1, 0x40, %g1
218 faligndata %f4, %f6, %f20
219 faligndata %f6, %f8, %f22
220 faligndata %f8, %f10, %f24
221 faligndata %f10, %f12, %f26
222 faligndata %f12, %f14, %f28
223 faligndata %f14, %f0, %f30
224 EX_ST(STORE(std, %f16, %o0 + 0x00))
225 EX_ST(STORE(std, %f18, %o0 + 0x08))
226 EX_ST(STORE(std, %f20, %o0 + 0x10))
227 EX_ST(STORE(std, %f22, %o0 + 0x18))
228 EX_ST(STORE(std, %f24, %o0 + 0x20))
229 EX_ST(STORE(std, %f26, %o0 + 0x28))
230 EX_ST(STORE(std, %f28, %o0 + 0x30))
231 EX_ST(STORE(std, %f30, %o0 + 0x38))
232 add %o0, 0x40, %o0
233 bne,pt %icc, 1b
234 LOAD(prefetch, %g1 + 0x200, #n_reads_strong)
235 VISExitHalf
236
237 brz,pn %o2, .Lexit
238 cmp %o2, 19
239 ble,pn %icc, .Lsmall_unaligned
240 nop
241 ba,a,pt %icc, .Lmedium_unaligned
242
243.Lmedium:
244 LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
245 andcc %g2, 0x7, %g0
246 bne,pn %icc, .Lmedium_unaligned
247 nop
248.Lmedium_noprefetch:
249 andncc %o2, 0x20 - 1, %o5
250 be,pn %icc, 2f
251 sub %o2, %o5, %o2
2521: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
253 EX_LD(LOAD(ldx, %o1 + 0x08, %g2))
254 EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE))
255 EX_LD(LOAD(ldx, %o1 + 0x18, %o4))
256 add %o1, 0x20, %o1
257 subcc %o5, 0x20, %o5
258 EX_ST(STORE(stx, %g1, %o0 + 0x00))
259 EX_ST(STORE(stx, %g2, %o0 + 0x08))
260 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10))
261 EX_ST(STORE(stx, %o4, %o0 + 0x18))
262 bne,pt %icc, 1b
263 add %o0, 0x20, %o0
2642: andcc %o2, 0x18, %o5
265 be,pt %icc, 3f
266 sub %o2, %o5, %o2
2671: EX_LD(LOAD(ldx, %o1 + 0x00, %g1))
268 add %o1, 0x08, %o1
269 add %o0, 0x08, %o0
270 subcc %o5, 0x08, %o5
271 bne,pt %icc, 1b
272 EX_ST(STORE(stx, %g1, %o0 - 0x08))
2733: brz,pt %o2, .Lexit
274 cmp %o2, 0x04
275 bl,pn %icc, .Ltiny
276 nop
277 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
278 add %o1, 0x04, %o1
279 add %o0, 0x04, %o0
280 subcc %o2, 0x04, %o2
281 bne,pn %icc, .Ltiny
282 EX_ST(STORE(stw, %g1, %o0 - 0x04))
283 ba,a,pt %icc, .Lexit
284.Lmedium_unaligned:
285 /* First get dest 8 byte aligned. */
286 sub %g0, %o0, %g1
287 and %g1, 0x7, %g1
288 brz,pt %g1, 2f
289 sub %o2, %g1, %o2
290
2911: EX_LD(LOAD(ldub, %o1 + 0x00, %g2))
292 add %o1, 1, %o1
293 subcc %g1, 1, %g1
294 add %o0, 1, %o0
295 bne,pt %icc, 1b
296 EX_ST(STORE(stb, %g2, %o0 - 0x01))
2972:
298 and %o1, 0x7, %g1
299 brz,pn %g1, .Lmedium_noprefetch
300 sll %g1, 3, %g1
301 mov 64, %g2
302 sub %g2, %g1, %g2
303 andn %o1, 0x7, %o1
304 EX_LD(LOAD(ldx, %o1 + 0x00, %o4))
305 sllx %o4, %g1, %o4
306 andn %o2, 0x08 - 1, %o5
307 sub %o2, %o5, %o2
3081: EX_LD(LOAD(ldx, %o1 + 0x08, %g3))
309 add %o1, 0x08, %o1
310 subcc %o5, 0x08, %o5
311 srlx %g3, %g2, GLOBAL_SPARE
312 or GLOBAL_SPARE, %o4, GLOBAL_SPARE
313 EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00))
314 add %o0, 0x08, %o0
315 bne,pt %icc, 1b
316 sllx %g3, %g1, %o4
317 srl %g1, 3, %g1
318 add %o1, %g1, %o1
319 brz,pn %o2, .Lexit
320 nop
321 ba,pt %icc, .Lsmall_unaligned
322
323.Ltiny:
324 EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
325 subcc %o2, 1, %o2
326 be,pn %icc, .Lexit
327 EX_ST(STORE(stb, %g1, %o0 + 0x00))
328 EX_LD(LOAD(ldub, %o1 + 0x01, %g1))
329 subcc %o2, 1, %o2
330 be,pn %icc, .Lexit
331 EX_ST(STORE(stb, %g1, %o0 + 0x01))
332 EX_LD(LOAD(ldub, %o1 + 0x02, %g1))
333 ba,pt %icc, .Lexit
334 EX_ST(STORE(stb, %g1, %o0 + 0x02))
335
336.Lsmall:
337 andcc %g2, 0x3, %g0
338 bne,pn %icc, .Lsmall_unaligned
339 andn %o2, 0x4 - 1, %o5
340 sub %o2, %o5, %o2
3411:
342 EX_LD(LOAD(lduw, %o1 + 0x00, %g1))
343 add %o1, 0x04, %o1
344 subcc %o5, 0x04, %o5
345 add %o0, 0x04, %o0
346 bne,pt %icc, 1b
347 EX_ST(STORE(stw, %g1, %o0 - 0x04))
348 brz,pt %o2, .Lexit
349 nop
350 ba,a,pt %icc, .Ltiny
351
352.Lsmall_unaligned:
3531: EX_LD(LOAD(ldub, %o1 + 0x00, %g1))
354 add %o1, 1, %o1
355 add %o0, 1, %o0
356 subcc %o2, 1, %o2
357 bne,pt %icc, 1b
358 EX_ST(STORE(stb, %g1, %o0 - 0x01))
359 ba,a,pt %icc, .Lexit
360 .size FUNC_NAME, .-FUNC_NAME
diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S
new file mode 100644
index 000000000000..c21c34c61dda
--- /dev/null
+++ b/arch/sparc/lib/NG4patch.S
@@ -0,0 +1,43 @@
1/* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant.
2 *
3 * Copyright (C) 2012 David S. Miller <davem@davemloft.net>
4 */
5
6#define BRANCH_ALWAYS 0x10680000
7#define NOP 0x01000000
8#define NG_DO_PATCH(OLD, NEW) \
9 sethi %hi(NEW), %g1; \
10 or %g1, %lo(NEW), %g1; \
11 sethi %hi(OLD), %g2; \
12 or %g2, %lo(OLD), %g2; \
13 sub %g1, %g2, %g1; \
14 sethi %hi(BRANCH_ALWAYS), %g3; \
15 sll %g1, 11, %g1; \
16 srl %g1, 11 + 2, %g1; \
17 or %g3, %lo(BRANCH_ALWAYS), %g3; \
18 or %g3, %g1, %g3; \
19 stw %g3, [%g2]; \
20 sethi %hi(NOP), %g3; \
21 or %g3, %lo(NOP), %g3; \
22 stw %g3, [%g2 + 0x4]; \
23 flush %g2;
24
25 .globl niagara4_patch_copyops
26 .type niagara4_patch_copyops,#function
27niagara4_patch_copyops:
28 NG_DO_PATCH(memcpy, NG4memcpy)
29 NG_DO_PATCH(___copy_from_user, NG4copy_from_user)
30 NG_DO_PATCH(___copy_to_user, NG4copy_to_user)
31 retl
32 nop
33 .size niagara4_patch_copyops,.-niagara4_patch_copyops
34
35 .globl niagara4_patch_pageops
36 .type niagara4_patch_pageops,#function
37niagara4_patch_pageops:
38 NG_DO_PATCH(copy_user_page, NG4copy_user_page)
39 NG_DO_PATCH(_clear_page, NGclear_page)
40 NG_DO_PATCH(clear_user_page, NGclear_user_page)
41 retl
42 nop
43 .size niagara4_patch_pageops,.-niagara4_patch_pageops
diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S
index b9e790b9c6b8..423d46e2258b 100644
--- a/arch/sparc/lib/NGpage.S
+++ b/arch/sparc/lib/NGpage.S
@@ -59,6 +59,8 @@ NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */
59 restore 59 restore
60 60
61 .align 32 61 .align 32
62 .globl NGclear_page
63 .globl NGclear_user_page
62NGclear_page: /* %o0=dest */ 64NGclear_page: /* %o0=dest */
63NGclear_user_page: /* %o0=dest, %o1=vaddr */ 65NGclear_user_page: /* %o0=dest, %o1=vaddr */
64 rd %asi, %g3 66 rd %asi, %g3