diff options
author | David S. Miller <davem@davemloft.net> | 2012-09-27 00:11:01 -0400 |
---|---|---|
committer | David S. Miller <davem@davemloft.net> | 2012-09-27 03:35:11 -0400 |
commit | ae2c6ca64118b934ef85f66adb03d5bbfdd57201 (patch) | |
tree | 5eb9a50cce32cadd527d5fc92095c76c00b72bae | |
parent | da201161662b8ee9c8d7bd8cc50ce3cb3366d400 (diff) |
sparc64: Add SPARC-T4 optimized memcpy.
Before After
-------------- --------------
bw_tcp: 1288.53 MB/sec 1637.77 MB/sec
bw_pipe: 1517.18 MB/sec 2107.61 MB/sec
bw_unix: 1838.38 MB/sec 2640.91 MB/sec
make -s -j128
allmodconfig 5min 49sec 5min 31sec
Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r-- | arch/sparc/kernel/head_64.S | 14 | ||||
-rw-r--r-- | arch/sparc/lib/Makefile | 3 | ||||
-rw-r--r-- | arch/sparc/lib/NG4copy_from_user.S | 30 | ||||
-rw-r--r-- | arch/sparc/lib/NG4copy_page.S | 57 | ||||
-rw-r--r-- | arch/sparc/lib/NG4copy_to_user.S | 39 | ||||
-rw-r--r-- | arch/sparc/lib/NG4memcpy.S | 360 | ||||
-rw-r--r-- | arch/sparc/lib/NG4patch.S | 43 | ||||
-rw-r--r-- | arch/sparc/lib/NGpage.S | 2 |
8 files changed, 546 insertions, 2 deletions
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index b42ddbf9651e..ee5dcced2499 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S | |||
@@ -559,10 +559,10 @@ niagara_tlb_fixup: | |||
559 | be,pt %xcc, niagara2_patch | 559 | be,pt %xcc, niagara2_patch |
560 | nop | 560 | nop |
561 | cmp %g1, SUN4V_CHIP_NIAGARA4 | 561 | cmp %g1, SUN4V_CHIP_NIAGARA4 |
562 | be,pt %xcc, niagara2_patch | 562 | be,pt %xcc, niagara4_patch |
563 | nop | 563 | nop |
564 | cmp %g1, SUN4V_CHIP_NIAGARA5 | 564 | cmp %g1, SUN4V_CHIP_NIAGARA5 |
565 | be,pt %xcc, niagara2_patch | 565 | be,pt %xcc, niagara4_patch |
566 | nop | 566 | nop |
567 | 567 | ||
568 | call generic_patch_copyops | 568 | call generic_patch_copyops |
@@ -573,6 +573,16 @@ niagara_tlb_fixup: | |||
573 | nop | 573 | nop |
574 | 574 | ||
575 | ba,a,pt %xcc, 80f | 575 | ba,a,pt %xcc, 80f |
576 | niagara4_patch: /* Branch target for SUN4V_CHIP_NIAGARA4 and SUN4V_CHIP_NIAGARA5: install the Niagara-4 copy and page routines. */ | ||
577 | call niagara4_patch_copyops /* redirects memcpy, ___copy_from_user and ___copy_to_user to their NG4 variants */ | ||
578 | nop /* call delay slot */ | ||
579 | call niagara_patch_bzero /* defined outside this patch; presumably installs the Niagara bzero routines -- confirm */ | ||
580 | nop /* call delay slot */ | ||
581 | call niagara4_patch_pageops /* redirects copy_user_page to NG4copy_user_page and the clear-page entry points to NGclear_page and NGclear_user_page */ | ||
582 | nop /* call delay slot */ | ||
583 | |||
584 | ba,a,pt %xcc, 80f /* annulled branch to local label 80 (outside this hunk) rather than falling through into niagara2_patch */ | ||
585 | |||
576 | niagara2_patch: | 586 | niagara2_patch: |
577 | call niagara2_patch_copyops | 587 | call niagara2_patch_copyops |
578 | nop | 588 | nop |
diff --git a/arch/sparc/lib/Makefile b/arch/sparc/lib/Makefile index dff4096f3dec..30f6ab51c551 100644 --- a/arch/sparc/lib/Makefile +++ b/arch/sparc/lib/Makefile | |||
@@ -32,6 +32,9 @@ lib-$(CONFIG_SPARC64) += NGpatch.o NGpage.o NGbzero.o | |||
32 | lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o | 32 | lib-$(CONFIG_SPARC64) += NG2memcpy.o NG2copy_from_user.o NG2copy_to_user.o |
33 | lib-$(CONFIG_SPARC64) += NG2patch.o | 33 | lib-$(CONFIG_SPARC64) += NG2patch.o |
34 | 34 | ||
35 | lib-$(CONFIG_SPARC64) += NG4memcpy.o NG4copy_from_user.o NG4copy_to_user.o | ||
36 | lib-$(CONFIG_SPARC64) += NG4patch.o NG4copy_page.o | ||
37 | |||
35 | lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o | 38 | lib-$(CONFIG_SPARC64) += GENmemcpy.o GENcopy_from_user.o GENcopy_to_user.o |
36 | lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o | 39 | lib-$(CONFIG_SPARC64) += GENpatch.o GENpage.o GENbzero.o |
37 | 40 | ||
diff --git a/arch/sparc/lib/NG4copy_from_user.S b/arch/sparc/lib/NG4copy_from_user.S new file mode 100644 index 000000000000..c8e98307ef59 --- /dev/null +++ b/arch/sparc/lib/NG4copy_from_user.S | |||
@@ -0,0 +1,30 @@ | |||
1 | /* NG4copy_from_user.S: Niagara-4 optimized copy from userspace. | ||
2 | * | ||
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #define EX_LD(x) \ | ||
7 | 98: x; \ | ||
8 | .section __ex_table,"a";\ | ||
9 | .align 4; \ | ||
10 | .word 98b, __retl_one_asi;\ | ||
11 | .text; \ | ||
12 | .align 4; | ||
13 | |||
14 | #ifndef ASI_AIUS | ||
15 | #define ASI_AIUS 0x11 | ||
16 | #endif | ||
17 | |||
18 | #define FUNC_NAME NG4copy_from_user | ||
19 | #define LOAD(type,addr,dest) type##a [addr] %asi, dest | ||
20 | #define EX_RETVAL(x) 0 | ||
21 | |||
22 | #ifdef __KERNEL__ | ||
23 | #define PREAMBLE \ | ||
24 | rd %asi, %g1; \ | ||
25 | cmp %g1, ASI_AIUS; \ | ||
26 | bne,pn %icc, ___copy_in_user; \ | ||
27 | nop | ||
28 | #endif | ||
29 | |||
30 | #include "NG4memcpy.S" | ||
diff --git a/arch/sparc/lib/NG4copy_page.S b/arch/sparc/lib/NG4copy_page.S new file mode 100644 index 000000000000..f30ec10bbcac --- /dev/null +++ b/arch/sparc/lib/NG4copy_page.S | |||
@@ -0,0 +1,57 @@ | |||
1 | /* NG4copy_page.S: Niagara-4 optimized copy page. | ||
2 | * | ||
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #include <asm/asi.h> | ||
7 | #include <asm/page.h> | ||
8 | |||
9 | .text | ||
10 | .align 32 | ||
11 | |||
12 | .register %g2, #scratch | ||
13 | .register %g3, #scratch | ||
14 | |||
15 | .globl NG4copy_user_page | ||
16 | NG4copy_user_page: /* Copy PAGE_SIZE bytes, 64 per loop pass. %o0=dest, %o1=src, %o2=vaddr (never read; %o2 is reused as a data register). Clobbers %o2-%o5, %g1-%g3, %g7 and the condition codes. */ | ||
17 | prefetch [%o1 + 0x000], #n_reads_strong /* prime the first 0x200 bytes of the source, one prefetch per 0x40 bytes */ | ||
18 | prefetch [%o1 + 0x040], #n_reads_strong | ||
19 | prefetch [%o1 + 0x080], #n_reads_strong | ||
20 | prefetch [%o1 + 0x0c0], #n_reads_strong | ||
21 | set PAGE_SIZE, %g7 /* %g7 = bytes still to copy */ | ||
22 | prefetch [%o1 + 0x100], #n_reads_strong | ||
23 | prefetch [%o1 + 0x140], #n_reads_strong | ||
24 | prefetch [%o1 + 0x180], #n_reads_strong | ||
25 | prefetch [%o1 + 0x1c0], #n_reads_strong | ||
26 | 1: /* loop: eight 8-byte loads interleaved with eight 8-byte stores */ | ||
27 | ldx [%o1 + 0x00], %o2 | ||
28 | subcc %g7, 0x40, %g7 /* sets the condition codes tested by the bne at the bottom; nothing in between modifies them */ | ||
29 | ldx [%o1 + 0x08], %o3 | ||
30 | ldx [%o1 + 0x10], %o4 | ||
31 | ldx [%o1 + 0x18], %o5 | ||
32 | ldx [%o1 + 0x20], %g1 | ||
33 | stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P /* each store goes through the block-init ASI; %o0 advances by 8 after every store */ | ||
34 | add %o0, 0x08, %o0 | ||
35 | ldx [%o1 + 0x28], %g2 | ||
36 | stxa %o3, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
37 | add %o0, 0x08, %o0 | ||
38 | ldx [%o1 + 0x30], %g3 | ||
39 | stxa %o4, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
40 | add %o0, 0x08, %o0 | ||
41 | ldx [%o1 + 0x38], %o2 /* %o2 is free again: its first value was already stored above */ | ||
42 | add %o1, 0x40, %o1 /* source pointer moves to the next 64-byte chunk */ | ||
43 | stxa %o5, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
44 | add %o0, 0x08, %o0 | ||
45 | stxa %g1, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
46 | add %o0, 0x08, %o0 | ||
47 | stxa %g2, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
48 | add %o0, 0x08, %o0 | ||
49 | stxa %g3, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
50 | add %o0, 0x08, %o0 | ||
51 | stxa %o2, [%o0] ASI_BLK_INIT_QUAD_LDD_P | ||
52 | add %o0, 0x08, %o0 | ||
53 | bne,pt %icc, 1b /* repeat until %g7 reaches zero */ | ||
54 | prefetch [%o1 + 0x200], #n_reads_strong /* delay slot: keep prefetching 0x200 bytes ahead of the advanced source pointer */ | ||
55 | retl | ||
56 | membar #StoreLoad | #StoreStore /* delay slot: barrier after the block-init stores, issued before control returns to the caller */ | ||
57 | .size NG4copy_user_page,.-NG4copy_user_page | ||
diff --git a/arch/sparc/lib/NG4copy_to_user.S b/arch/sparc/lib/NG4copy_to_user.S new file mode 100644 index 000000000000..9744c4540a8d --- /dev/null +++ b/arch/sparc/lib/NG4copy_to_user.S | |||
@@ -0,0 +1,39 @@ | |||
1 | /* NG4copy_to_user.S: Niagara-4 optimized copy to userspace. | ||
2 | * | ||
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #define EX_ST(x) \ | ||
7 | 98: x; \ | ||
8 | .section __ex_table,"a";\ | ||
9 | .align 4; \ | ||
10 | .word 98b, __retl_one_asi;\ | ||
11 | .text; \ | ||
12 | .align 4; | ||
13 | |||
14 | #ifndef ASI_AIUS | ||
15 | #define ASI_AIUS 0x11 | ||
16 | #endif | ||
17 | |||
18 | #ifndef ASI_BLK_INIT_QUAD_LDD_AIUS | ||
19 | #define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23 | ||
20 | #endif | ||
21 | |||
22 | #define FUNC_NAME NG4copy_to_user | ||
23 | #define STORE(type,src,addr) type##a src, [addr] %asi | ||
24 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS | ||
25 | #define EX_RETVAL(x) 0 | ||
26 | |||
27 | #ifdef __KERNEL__ | ||
28 | /* Writing to %asi is _expensive_ so we hardcode it. | ||
29 | * Reading %asi to check for KERNEL_DS is comparatively | ||
30 | * cheap. | ||
31 | */ | ||
32 | #define PREAMBLE \ | ||
33 | rd %asi, %g1; \ | ||
34 | cmp %g1, ASI_AIUS; \ | ||
35 | bne,pn %icc, ___copy_in_user; \ | ||
36 | nop | ||
37 | #endif | ||
38 | |||
39 | #include "NG4memcpy.S" | ||
diff --git a/arch/sparc/lib/NG4memcpy.S b/arch/sparc/lib/NG4memcpy.S new file mode 100644 index 000000000000..6f4b526b3c4f --- /dev/null +++ b/arch/sparc/lib/NG4memcpy.S | |||
@@ -0,0 +1,360 @@ | |||
1 | /* NG4memcpy.S: Niagara-4 optimized memcpy. | ||
2 | * | ||
3 | * Copyright (C) 2012 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #ifdef __KERNEL__ | ||
7 | #include <asm/visasm.h> | ||
8 | #include <asm/asi.h> | ||
9 | #define GLOBAL_SPARE %g7 | ||
10 | #else | ||
11 | #define ASI_BLK_INIT_QUAD_LDD_P 0xe2 | ||
12 | #define FPRS_FEF 0x04 | ||
13 | |||
14 | /* On T4 it is very expensive to access ASRs like %fprs and | ||
15 | * %asi, avoiding a read or a write can save ~50 cycles. | ||
16 | */ | ||
17 | #define FPU_ENTER \ | ||
18 | rd %fprs, %o5; \ | ||
19 | andcc %o5, FPRS_FEF, %g0; \ | ||
20 | be,a,pn %icc, 999f; \ | ||
21 | wr %g0, FPRS_FEF, %fprs; \ | ||
22 | 999: | ||
23 | |||
24 | #ifdef MEMCPY_DEBUG | ||
25 | #define VISEntryHalf FPU_ENTER; \ | ||
26 | clr %g1; clr %g2; clr %g3; clr %g5; subcc %g0, %g0, %g0; | ||
27 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | ||
28 | #else | ||
29 | #define VISEntryHalf FPU_ENTER | ||
30 | #define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs | ||
31 | #endif | ||
32 | |||
33 | #define GLOBAL_SPARE %g5 | ||
34 | #endif | ||
35 | |||
36 | #ifndef STORE_ASI | ||
37 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA | ||
38 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P | ||
39 | #else | ||
40 | #define STORE_ASI 0x80 /* ASI_P */ | ||
41 | #endif | ||
42 | #endif | ||
43 | |||
44 | #ifndef EX_LD | ||
45 | #define EX_LD(x) x | ||
46 | #endif | ||
47 | |||
48 | #ifndef EX_ST | ||
49 | #define EX_ST(x) x | ||
50 | #endif | ||
51 | |||
52 | #ifndef EX_RETVAL | ||
53 | #define EX_RETVAL(x) x | ||
54 | #endif | ||
55 | |||
56 | #ifndef LOAD | ||
57 | #define LOAD(type,addr,dest) type [addr], dest | ||
58 | #endif | ||
59 | |||
60 | #ifndef STORE | ||
61 | #ifndef MEMCPY_DEBUG | ||
62 | #define STORE(type,src,addr) type src, [addr] | ||
63 | #else | ||
64 | #define STORE(type,src,addr) type##a src, [addr] %asi | ||
65 | #endif | ||
66 | #endif | ||
67 | |||
68 | #ifndef STORE_INIT | ||
69 | #define STORE_INIT(src,addr) stxa src, [addr] STORE_ASI | ||
70 | #endif | ||
71 | |||
72 | #ifndef FUNC_NAME | ||
73 | #define FUNC_NAME NG4memcpy | ||
74 | #endif | ||
75 | #ifndef PREAMBLE | ||
76 | #define PREAMBLE | ||
77 | #endif | ||
78 | |||
79 | #ifndef XCC | ||
80 | #define XCC xcc | ||
81 | #endif | ||
82 | |||
83 | .register %g2,#scratch | ||
84 | .register %g3,#scratch | ||
85 | |||
86 | .text | ||
87 | .align 64 | ||
88 | |||
89 | .globl FUNC_NAME | ||
90 | .type FUNC_NAME,#function | ||
91 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ | ||
92 | #ifdef MEMCPY_DEBUG | ||
93 | wr %g0, 0x80, %asi | ||
94 | #endif | ||
95 | srlx %o2, 31, %g2 | ||
96 | cmp %g2, 0 | ||
97 | tne %XCC, 5 | ||
98 | PREAMBLE | ||
99 | mov %o0, %o3 | ||
100 | brz,pn %o2, .Lexit | ||
101 | cmp %o2, 3 | ||
102 | ble,pn %icc, .Ltiny | ||
103 | cmp %o2, 19 | ||
104 | ble,pn %icc, .Lsmall | ||
105 | or %o0, %o1, %g2 | ||
106 | cmp %o2, 128 | ||
107 | bl,pn %icc, .Lmedium | ||
108 | nop | ||
109 | |||
110 | .Llarge:/* len >= 0x80 */ | ||
111 | /* First get dest 8 byte aligned. */ | ||
112 | sub %g0, %o0, %g1 | ||
113 | and %g1, 0x7, %g1 | ||
114 | brz,pt %g1, 51f | ||
115 | sub %o2, %g1, %o2 | ||
116 | |||
117 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) | ||
118 | add %o1, 1, %o1 | ||
119 | subcc %g1, 1, %g1 | ||
120 | add %o0, 1, %o0 | ||
121 | bne,pt %icc, 1b | ||
122 | EX_ST(STORE(stb, %g2, %o0 - 0x01)) | ||
123 | |||
124 | 51: LOAD(prefetch, %o1 + 0x040, #n_reads_strong) | ||
125 | LOAD(prefetch, %o1 + 0x080, #n_reads_strong) | ||
126 | LOAD(prefetch, %o1 + 0x0c0, #n_reads_strong) | ||
127 | LOAD(prefetch, %o1 + 0x100, #n_reads_strong) | ||
128 | LOAD(prefetch, %o1 + 0x140, #n_reads_strong) | ||
129 | LOAD(prefetch, %o1 + 0x180, #n_reads_strong) | ||
130 | LOAD(prefetch, %o1 + 0x1c0, #n_reads_strong) | ||
131 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) | ||
132 | |||
133 | /* Check if we can use the straight fully aligned | ||
134 | * loop, or we require the alignaddr/faligndata variant. | ||
135 | */ | ||
136 | andcc %o1, 0x7, %o5 | ||
137 | bne,pn %icc, .Llarge_src_unaligned | ||
138 | sub %g0, %o0, %g1 | ||
139 | |||
140 | /* Legitimize the use of initializing stores by getting dest | ||
141 | * to be 64-byte aligned. | ||
142 | */ | ||
143 | and %g1, 0x3f, %g1 | ||
144 | brz,pt %g1, .Llarge_aligned | ||
145 | sub %o2, %g1, %o2 | ||
146 | |||
147 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g2)) | ||
148 | add %o1, 8, %o1 | ||
149 | subcc %g1, 8, %g1 | ||
150 | add %o0, 8, %o0 | ||
151 | bne,pt %icc, 1b | ||
152 | EX_ST(STORE(stx, %g2, %o0 - 0x08)) | ||
153 | |||
154 | .Llarge_aligned: | ||
155 | /* len >= 0x40 && src 8-byte aligned && dest 64-byte aligned */ | ||
156 | andn %o2, 0x3f, %o4 | ||
157 | sub %o2, %o4, %o2 | ||
158 | |||
159 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) | ||
160 | add %o1, 0x40, %o1 | ||
161 | EX_LD(LOAD(ldx, %o1 - 0x38, %g2)) | ||
162 | subcc %o4, 0x40, %o4 | ||
163 | EX_LD(LOAD(ldx, %o1 - 0x30, %g3)) | ||
164 | EX_LD(LOAD(ldx, %o1 - 0x28, GLOBAL_SPARE)) | ||
165 | EX_LD(LOAD(ldx, %o1 - 0x20, %o5)) | ||
166 | EX_ST(STORE_INIT(%g1, %o0)) | ||
167 | add %o0, 0x08, %o0 | ||
168 | EX_ST(STORE_INIT(%g2, %o0)) | ||
169 | add %o0, 0x08, %o0 | ||
170 | EX_LD(LOAD(ldx, %o1 - 0x18, %g2)) | ||
171 | EX_ST(STORE_INIT(%g3, %o0)) | ||
172 | add %o0, 0x08, %o0 | ||
173 | EX_LD(LOAD(ldx, %o1 - 0x10, %g3)) | ||
174 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) | ||
175 | add %o0, 0x08, %o0 | ||
176 | EX_LD(LOAD(ldx, %o1 - 0x08, GLOBAL_SPARE)) | ||
177 | EX_ST(STORE_INIT(%o5, %o0)) | ||
178 | add %o0, 0x08, %o0 | ||
179 | EX_ST(STORE_INIT(%g2, %o0)) | ||
180 | add %o0, 0x08, %o0 | ||
181 | EX_ST(STORE_INIT(%g3, %o0)) | ||
182 | add %o0, 0x08, %o0 | ||
183 | EX_ST(STORE_INIT(GLOBAL_SPARE, %o0)) | ||
184 | add %o0, 0x08, %o0 | ||
185 | bne,pt %icc, 1b | ||
186 | LOAD(prefetch, %o1 + 0x200, #n_reads_strong) | ||
187 | |||
188 | membar #StoreLoad | #StoreStore | ||
189 | |||
190 | brz,pn %o2, .Lexit | ||
191 | cmp %o2, 19 | ||
192 | ble,pn %icc, .Lsmall_unaligned | ||
193 | nop | ||
194 | ba,a,pt %icc, .Lmedium_noprefetch | ||
195 | |||
196 | .Lexit: retl | ||
197 | mov EX_RETVAL(%o3), %o0 | ||
198 | |||
199 | .Llarge_src_unaligned: | ||
200 | andn %o2, 0x3f, %o4 | ||
201 | sub %o2, %o4, %o2 | ||
202 | VISEntryHalf | ||
203 | alignaddr %o1, %g0, %g1 | ||
204 | add %o1, %o4, %o1 | ||
205 | EX_LD(LOAD(ldd, %g1 + 0x00, %f0)) | ||
206 | 1: EX_LD(LOAD(ldd, %g1 + 0x08, %f2)) | ||
207 | subcc %o4, 0x40, %o4 | ||
208 | EX_LD(LOAD(ldd, %g1 + 0x10, %f4)) | ||
209 | EX_LD(LOAD(ldd, %g1 + 0x18, %f6)) | ||
210 | EX_LD(LOAD(ldd, %g1 + 0x20, %f8)) | ||
211 | EX_LD(LOAD(ldd, %g1 + 0x28, %f10)) | ||
212 | EX_LD(LOAD(ldd, %g1 + 0x30, %f12)) | ||
213 | EX_LD(LOAD(ldd, %g1 + 0x38, %f14)) | ||
214 | faligndata %f0, %f2, %f16 | ||
215 | EX_LD(LOAD(ldd, %g1 + 0x40, %f0)) | ||
216 | faligndata %f2, %f4, %f18 | ||
217 | add %g1, 0x40, %g1 | ||
218 | faligndata %f4, %f6, %f20 | ||
219 | faligndata %f6, %f8, %f22 | ||
220 | faligndata %f8, %f10, %f24 | ||
221 | faligndata %f10, %f12, %f26 | ||
222 | faligndata %f12, %f14, %f28 | ||
223 | faligndata %f14, %f0, %f30 | ||
224 | EX_ST(STORE(std, %f16, %o0 + 0x00)) | ||
225 | EX_ST(STORE(std, %f18, %o0 + 0x08)) | ||
226 | EX_ST(STORE(std, %f20, %o0 + 0x10)) | ||
227 | EX_ST(STORE(std, %f22, %o0 + 0x18)) | ||
228 | EX_ST(STORE(std, %f24, %o0 + 0x20)) | ||
229 | EX_ST(STORE(std, %f26, %o0 + 0x28)) | ||
230 | EX_ST(STORE(std, %f28, %o0 + 0x30)) | ||
231 | EX_ST(STORE(std, %f30, %o0 + 0x38)) | ||
232 | add %o0, 0x40, %o0 | ||
233 | bne,pt %icc, 1b | ||
234 | LOAD(prefetch, %g1 + 0x200, #n_reads_strong) | ||
235 | VISExitHalf | ||
236 | |||
237 | brz,pn %o2, .Lexit | ||
238 | cmp %o2, 19 | ||
239 | ble,pn %icc, .Lsmall_unaligned | ||
240 | nop | ||
241 | ba,a,pt %icc, .Lmedium_unaligned | ||
242 | |||
243 | .Lmedium: | ||
244 | LOAD(prefetch, %o1 + 0x40, #n_reads_strong) | ||
245 | andcc %g2, 0x7, %g0 | ||
246 | bne,pn %icc, .Lmedium_unaligned | ||
247 | nop | ||
248 | .Lmedium_noprefetch: | ||
249 | andncc %o2, 0x20 - 1, %o5 | ||
250 | be,pn %icc, 2f | ||
251 | sub %o2, %o5, %o2 | ||
252 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) | ||
253 | EX_LD(LOAD(ldx, %o1 + 0x08, %g2)) | ||
254 | EX_LD(LOAD(ldx, %o1 + 0x10, GLOBAL_SPARE)) | ||
255 | EX_LD(LOAD(ldx, %o1 + 0x18, %o4)) | ||
256 | add %o1, 0x20, %o1 | ||
257 | subcc %o5, 0x20, %o5 | ||
258 | EX_ST(STORE(stx, %g1, %o0 + 0x00)) | ||
259 | EX_ST(STORE(stx, %g2, %o0 + 0x08)) | ||
260 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x10)) | ||
261 | EX_ST(STORE(stx, %o4, %o0 + 0x18)) | ||
262 | bne,pt %icc, 1b | ||
263 | add %o0, 0x20, %o0 | ||
264 | 2: andcc %o2, 0x18, %o5 | ||
265 | be,pt %icc, 3f | ||
266 | sub %o2, %o5, %o2 | ||
267 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %g1)) | ||
268 | add %o1, 0x08, %o1 | ||
269 | add %o0, 0x08, %o0 | ||
270 | subcc %o5, 0x08, %o5 | ||
271 | bne,pt %icc, 1b | ||
272 | EX_ST(STORE(stx, %g1, %o0 - 0x08)) | ||
273 | 3: brz,pt %o2, .Lexit | ||
274 | cmp %o2, 0x04 | ||
275 | bl,pn %icc, .Ltiny | ||
276 | nop | ||
277 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) | ||
278 | add %o1, 0x04, %o1 | ||
279 | add %o0, 0x04, %o0 | ||
280 | subcc %o2, 0x04, %o2 | ||
281 | bne,pn %icc, .Ltiny | ||
282 | EX_ST(STORE(stw, %g1, %o0 - 0x04)) | ||
283 | ba,a,pt %icc, .Lexit | ||
284 | .Lmedium_unaligned: /* Entered from .Lmedium when (dst | src) & 7 is non-zero, and for the tail left over by .Llarge_src_unaligned. %o0=dst, %o1=src, %o2=len. */ | ||
285 | /* First get dest 8 byte aligned. */ | ||
286 | sub %g0, %o0, %g1 | ||
287 | and %g1, 0x7, %g1 /* %g1 = bytes needed to reach an 8-byte dst boundary */ | ||
288 | brz,pt %g1, 2f | ||
289 | sub %o2, %g1, %o2 /* delay slot: account for the head bytes (a no-op when %g1 is 0) */ | ||
290 | | ||
291 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g2)) | ||
292 | add %o1, 1, %o1 | ||
293 | subcc %g1, 1, %g1 | ||
294 | add %o0, 1, %o0 | ||
295 | bne,pt %icc, 1b | ||
296 | EX_ST(STORE(stb, %g2, %o0 - 0x01)) /* delay slot: store the byte just loaded */ | ||
297 | 2: | ||
298 | and %o1, 0x7, %g1 /* dst is now 8-byte aligned; check whether src is as well */ | ||
299 | brz,pn %g1, .Lmedium_noprefetch /* both aligned: use the plain 8-byte copy path */ | ||
300 | sll %g1, 3, %g1 /* delay slot: %g1 = src misalignment in bits */ | ||
301 | mov 64, %g2 | ||
302 | sub %g2, %g1, %g2 /* %g2 = 64 - %g1, the complementary shift count */ | ||
303 | andn %o1, 0x7, %o1 /* round src down to its containing 8-byte word */ | ||
304 | EX_LD(LOAD(ldx, %o1 + 0x00, %o4)) | ||
305 | sllx %o4, %g1, %o4 /* %o4 = carry: wanted bytes of the first word, moved to the top */ | ||
306 | andn %o2, 0x08 - 1, %o5 /* %o5 = bytes to copy as whole 8-byte words */ | ||
307 | sub %o2, %o5, %o2 /* %o2 = 0..7 trailing bytes */ | ||
308 | 1: EX_LD(LOAD(ldx, %o1 + 0x08, %g3)) /* loop: merge the carry with the top of the next source word */ | ||
309 | add %o1, 0x08, %o1 | ||
310 | subcc %o5, 0x08, %o5 | ||
311 | srlx %g3, %g2, GLOBAL_SPARE | ||
312 | or GLOBAL_SPARE, %o4, GLOBAL_SPARE | ||
313 | EX_ST(STORE(stx, GLOBAL_SPARE, %o0 + 0x00)) | ||
314 | add %o0, 0x08, %o0 | ||
315 | bne,pt %icc, 1b | ||
316 | sllx %g3, %g1, %o4 /* delay slot: the remainder of this word becomes the next carry */ | ||
317 | srl %g1, 3, %g1 /* back from bits to bytes */ | ||
318 | add %o1, %g1, %o1 /* restore the true (unaligned) src pointer for the byte tail */ | ||
319 | brz,pn %o2, .Lexit | ||
320 | nop | ||
321 | ba,a,pt %icc, .Lsmall_unaligned /* annulled, like the other unconditional tail branches here: nothing is supplied for the delay slot, so the next instruction in the file (the first load of .Ltiny) must not execute in it */ | ||
322 | |||
323 | .Ltiny: /* Copy 1 to 3 bytes; every branch to this label has already excluded %o2 == 0 and %o2 > 3. %o0=dst, %o1=src, %o2=len. */ | ||
324 | EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) | ||
325 | subcc %o2, 1, %o2 | ||
326 | be,pn %icc, .Lexit /* that was the only byte */ | ||
327 | EX_ST(STORE(stb, %g1, %o0 + 0x00)) /* delay slot, not annulled: the store happens whether or not the branch is taken */ | ||
328 | EX_LD(LOAD(ldub, %o1 + 0x01, %g1)) | ||
329 | subcc %o2, 1, %o2 | ||
330 | be,pn %icc, .Lexit /* two bytes in total */ | ||
331 | EX_ST(STORE(stb, %g1, %o0 + 0x01)) /* delay slot, always executed */ | ||
332 | EX_LD(LOAD(ldub, %o1 + 0x02, %g1)) | ||
333 | ba,pt %icc, .Lexit | ||
334 | EX_ST(STORE(stb, %g1, %o0 + 0x02)) /* delay slot: third and last byte */ | ||
335 | |||
336 | .Lsmall: /* len is 4..19 here. %g2 = dst | src, computed in the delay slot of the branch that dispatches to this label. */ | ||
337 | andcc %g2, 0x3, %g0 /* are both pointers 4-byte aligned? */ | ||
338 | bne,pn %icc, .Lsmall_unaligned /* no: fall back to the byte loop */ | ||
339 | andn %o2, 0x4 - 1, %o5 /* delay slot: %o5 = bytes to copy as 32-bit words */ | ||
340 | sub %o2, %o5, %o2 /* %o2 = 0..3 trailing bytes */ | ||
341 | 1: /* word loop: one 4-byte load and store per pass */ | ||
342 | EX_LD(LOAD(lduw, %o1 + 0x00, %g1)) | ||
343 | add %o1, 0x04, %o1 | ||
344 | subcc %o5, 0x04, %o5 | ||
345 | add %o0, 0x04, %o0 | ||
346 | bne,pt %icc, 1b | ||
347 | EX_ST(STORE(stw, %g1, %o0 - 0x04)) /* delay slot: store the word just loaded (dst was already advanced) */ | ||
348 | brz,pt %o2, .Lexit /* no trailing bytes */ | ||
349 | nop | ||
350 | ba,a,pt %icc, .Ltiny /* 1..3 bytes left; annulled, so no delay-slot instruction runs */ | ||
351 | |||
352 | .Lsmall_unaligned: /* Byte-at-a-time copy of %o2 bytes from %o1 to %o0. Every branch to this label is made with %o2 non-zero; a zero count would wrap in the subcc below. */ | ||
353 | 1: EX_LD(LOAD(ldub, %o1 + 0x00, %g1)) | ||
354 | add %o1, 1, %o1 | ||
355 | add %o0, 1, %o0 | ||
356 | subcc %o2, 1, %o2 | ||
357 | bne,pt %icc, 1b | ||
358 | EX_ST(STORE(stb, %g1, %o0 - 0x01)) /* delay slot: store the byte just loaded (dst was already advanced) */ | ||
359 | ba,a,pt %icc, .Lexit /* annulled, so no delay-slot instruction runs */ | ||
360 | .size FUNC_NAME, .-FUNC_NAME | ||
diff --git a/arch/sparc/lib/NG4patch.S b/arch/sparc/lib/NG4patch.S new file mode 100644 index 000000000000..c21c34c61dda --- /dev/null +++ b/arch/sparc/lib/NG4patch.S | |||
@@ -0,0 +1,43 @@ | |||
1 | /* NG4patch.S: Patch Ultra-I routines with Niagara-4 variant. | ||
2 | * | ||
3 | * Copyright (C) 2012 David S. Miller <davem@davemloft.net> | ||
4 | */ | ||
5 | |||
6 | #define BRANCH_ALWAYS 0x10680000 | ||
7 | #define NOP 0x01000000 | ||
8 | #define NG_DO_PATCH(OLD, NEW) \ | ||
9 | sethi %hi(NEW), %g1; \ | ||
10 | or %g1, %lo(NEW), %g1; \ | ||
11 | sethi %hi(OLD), %g2; \ | ||
12 | or %g2, %lo(OLD), %g2; \ | ||
13 | sub %g1, %g2, %g1; \ | ||
14 | sethi %hi(BRANCH_ALWAYS), %g3; \ | ||
15 | sll %g1, 11, %g1; \ | ||
16 | srl %g1, 11 + 2, %g1; \ | ||
17 | or %g3, %lo(BRANCH_ALWAYS), %g3; \ | ||
18 | or %g3, %g1, %g3; \ | ||
19 | stw %g3, [%g2]; \ | ||
20 | sethi %hi(NOP), %g3; \ | ||
21 | or %g3, %lo(NOP), %g3; \ | ||
22 | stw %g3, [%g2 + 0x4]; \ | ||
23 | flush %g2; | ||
24 | |||
25 | .globl niagara4_patch_copyops | ||
26 | .type niagara4_patch_copyops,#function | ||
27 | niagara4_patch_copyops: /* Overwrite the first two instructions of each generic copy routine with a branch to its NG4 variant followed by a nop. Each NG_DO_PATCH clobbers %g1, %g2 and %g3. */ | ||
28 | NG_DO_PATCH(memcpy, NG4memcpy) | ||
29 | NG_DO_PATCH(___copy_from_user, NG4copy_from_user) | ||
30 | NG_DO_PATCH(___copy_to_user, NG4copy_to_user) | ||
31 | retl | ||
32 | nop /* retl delay slot */ | ||
33 | .size niagara4_patch_copyops,.-niagara4_patch_copyops | ||
34 | |||
35 | .globl niagara4_patch_pageops | ||
36 | .type niagara4_patch_pageops,#function | ||
37 | niagara4_patch_pageops: /* Same patching scheme for the page routines. Only copy_user_page gets an NG4-specific target; the clear-page entry points are pointed at the existing NGclear_page and NGclear_user_page. Each NG_DO_PATCH clobbers %g1, %g2 and %g3. */ | ||
38 | NG_DO_PATCH(copy_user_page, NG4copy_user_page) | ||
39 | NG_DO_PATCH(_clear_page, NGclear_page) | ||
40 | NG_DO_PATCH(clear_user_page, NGclear_user_page) | ||
41 | retl | ||
42 | nop /* retl delay slot */ | ||
43 | .size niagara4_patch_pageops,.-niagara4_patch_pageops | ||
diff --git a/arch/sparc/lib/NGpage.S b/arch/sparc/lib/NGpage.S index b9e790b9c6b8..423d46e2258b 100644 --- a/arch/sparc/lib/NGpage.S +++ b/arch/sparc/lib/NGpage.S | |||
@@ -59,6 +59,8 @@ NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */ | |||
59 | restore | 59 | restore |
60 | 60 | ||
61 | .align 32 | 61 | .align 32 |
62 | .globl NGclear_page | ||
63 | .globl NGclear_user_page | ||
62 | NGclear_page: /* %o0=dest */ | 64 | NGclear_page: /* %o0=dest */ |
63 | NGclear_user_page: /* %o0=dest, %o1=vaddr */ | 65 | NGclear_user_page: /* %o0=dest, %o1=vaddr */ |
64 | rd %asi, %g3 | 66 | rd %asi, %g3 |