Diffstat (limited to 'arch/sparc64/lib')
-rw-r--r--  arch/sparc64/lib/Makefile              2
-rw-r--r--  arch/sparc64/lib/NGbzero.S           163
-rw-r--r--  arch/sparc64/lib/NGcopy_from_user.S   37
-rw-r--r--  arch/sparc64/lib/NGcopy_to_user.S     40
-rw-r--r--  arch/sparc64/lib/NGmemcpy.S          368
-rw-r--r--  arch/sparc64/lib/NGpage.S             96
-rw-r--r--  arch/sparc64/lib/NGpatch.S            33
-rw-r--r--  arch/sparc64/lib/U3patch.S             3
-rw-r--r--  arch/sparc64/lib/bzero.S              18
-rw-r--r--  arch/sparc64/lib/clear_page.S         12
-rw-r--r--  arch/sparc64/lib/copy_page.S           7
-rw-r--r--  arch/sparc64/lib/delay.c              19
-rw-r--r--  arch/sparc64/lib/xor.S               300
13 files changed, 1064 insertions(+), 34 deletions(-)
diff --git a/arch/sparc64/lib/Makefile b/arch/sparc64/lib/Makefile
index c295806500f7..8812ded19f01 100644
--- a/arch/sparc64/lib/Makefile
+++ b/arch/sparc64/lib/Makefile
@@ -11,6 +11,8 @@ lib-y := PeeCeeI.o copy_page.o clear_page.o strlen.o strncmp.o \
 	 VISsave.o atomic.o bitops.o \
 	 U1memcpy.o U1copy_from_user.o U1copy_to_user.o \
 	 U3memcpy.o U3copy_from_user.o U3copy_to_user.o U3patch.o \
+	 NGmemcpy.o NGcopy_from_user.o NGcopy_to_user.o NGpatch.o \
+	 NGpage.o NGbzero.o \
 	 copy_in_user.o user_fixup.o memmove.o \
 	 mcount.o ipcsum.o rwsem.o xor.o find_bit.o delay.o
 
diff --git a/arch/sparc64/lib/NGbzero.S b/arch/sparc64/lib/NGbzero.S
new file mode 100644
index 000000000000..e86baece5cc8
--- /dev/null
+++ b/arch/sparc64/lib/NGbzero.S
@@ -0,0 +1,163 @@
1/* NGbzero.S: Niagara optimized memset/clear_user.
2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
4 */
5#include <asm/asi.h>
6
7#define EX_ST(x,y) \
898: x,y; \
9 .section .fixup; \
10 .align 4; \
1199: retl; \
12 mov %o1, %o0; \
13 .section __ex_table; \
14 .align 4; \
15 .word 98b, 99b; \
16 .text; \
17 .align 4;
18
19 .text
20
21 .globl NGmemset
22 .type NGmemset, #function
23NGmemset: /* %o0=buf, %o1=pat, %o2=len */
24 and %o1, 0xff, %o3
25 mov %o2, %o1
26 sllx %o3, 8, %g1
27 or %g1, %o3, %o2
28 sllx %o2, 16, %g1
29 or %g1, %o2, %o2
30 sllx %o2, 32, %g1
31 ba,pt %xcc, 1f
32 or %g1, %o2, %o2
33
34 .globl NGbzero
35 .type NGbzero, #function
36NGbzero:
37 clr %o2
381: brz,pn %o1, NGbzero_return
39 mov %o0, %o3
40
41 /* %o5: saved %asi, restored at NGbzero_done
42 * %g7: store-init %asi to use
43 * %o4: non-store-init %asi to use
44 */
45 rd %asi, %o5
46 mov ASI_BLK_INIT_QUAD_LDD_P, %g7
47 mov ASI_P, %o4
48 wr %o4, 0x0, %asi
49
50NGbzero_from_clear_user:
51 cmp %o1, 15
52 bl,pn %icc, NGbzero_tiny
53 andcc %o0, 0x7, %g1
54 be,pt %xcc, 2f
55 mov 8, %g2
56 sub %g2, %g1, %g1
57 sub %o1, %g1, %o1
581: EX_ST(stba %o2, [%o0 + 0x00] %asi)
59 subcc %g1, 1, %g1
60 bne,pt %xcc, 1b
61 add %o0, 1, %o0
622: cmp %o1, 128
63 bl,pn %icc, NGbzero_medium
64 andcc %o0, (64 - 1), %g1
65 be,pt %xcc, NGbzero_pre_loop
66 mov 64, %g2
67 sub %g2, %g1, %g1
68 sub %o1, %g1, %o1
691: EX_ST(stxa %o2, [%o0 + 0x00] %asi)
70 subcc %g1, 8, %g1
71 bne,pt %xcc, 1b
72 add %o0, 8, %o0
73
74NGbzero_pre_loop:
75 wr %g7, 0x0, %asi
76 andn %o1, (64 - 1), %g1
77 sub %o1, %g1, %o1
78NGbzero_loop:
79 EX_ST(stxa %o2, [%o0 + 0x00] %asi)
80 EX_ST(stxa %o2, [%o0 + 0x08] %asi)
81 EX_ST(stxa %o2, [%o0 + 0x10] %asi)
82 EX_ST(stxa %o2, [%o0 + 0x18] %asi)
83 EX_ST(stxa %o2, [%o0 + 0x20] %asi)
84 EX_ST(stxa %o2, [%o0 + 0x28] %asi)
85 EX_ST(stxa %o2, [%o0 + 0x30] %asi)
86 EX_ST(stxa %o2, [%o0 + 0x38] %asi)
87 subcc %g1, 64, %g1
88 bne,pt %xcc, NGbzero_loop
89 add %o0, 64, %o0
90
91 wr %o4, 0x0, %asi
92 brz,pn %o1, NGbzero_done
93NGbzero_medium:
94 andncc %o1, 0x7, %g1
95 be,pn %xcc, 2f
96 sub %o1, %g1, %o1
971: EX_ST(stxa %o2, [%o0 + 0x00] %asi)
98 subcc %g1, 8, %g1
99 bne,pt %xcc, 1b
100 add %o0, 8, %o0
1012: brz,pt %o1, NGbzero_done
102 nop
103
104NGbzero_tiny:
1051: EX_ST(stba %o2, [%o0 + 0x00] %asi)
106 subcc %o1, 1, %o1
107 bne,pt %icc, 1b
108 add %o0, 1, %o0
109
110 /* fallthrough */
111
112NGbzero_done:
113 wr %o5, 0x0, %asi
114
115NGbzero_return:
116 retl
117 mov %o3, %o0
118 .size NGbzero, .-NGbzero
119 .size NGmemset, .-NGmemset
120
121 .globl NGclear_user
122 .type NGclear_user, #function
123NGclear_user: /* %o0=buf, %o1=len */
124 rd %asi, %o5
125 brz,pn %o1, NGbzero_done
126 clr %o3
127 cmp %o5, ASI_AIUS
128 bne,pn %icc, NGbzero
129 clr %o2
130 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %g7
131 ba,pt %xcc, NGbzero_from_clear_user
132 mov ASI_AIUS, %o4
133 .size NGclear_user, .-NGclear_user
134
135#define BRANCH_ALWAYS 0x10680000
136#define NOP 0x01000000
137#define NG_DO_PATCH(OLD, NEW) \
138 sethi %hi(NEW), %g1; \
139 or %g1, %lo(NEW), %g1; \
140 sethi %hi(OLD), %g2; \
141 or %g2, %lo(OLD), %g2; \
142 sub %g1, %g2, %g1; \
143 sethi %hi(BRANCH_ALWAYS), %g3; \
144 sll %g1, 11, %g1; \
145 srl %g1, 11 + 2, %g1; \
146 or %g3, %lo(BRANCH_ALWAYS), %g3; \
147 or %g3, %g1, %g3; \
148 stw %g3, [%g2]; \
149 sethi %hi(NOP), %g3; \
150 or %g3, %lo(NOP), %g3; \
151 stw %g3, [%g2 + 0x4]; \
152 flush %g2;
153
154 .globl niagara_patch_bzero
155 .type niagara_patch_bzero,#function
156niagara_patch_bzero:
157 NG_DO_PATCH(memset, NGmemset)
158 NG_DO_PATCH(__bzero, NGbzero)
159 NG_DO_PATCH(__clear_user, NGclear_user)
160 NG_DO_PATCH(tsb_init, NGtsb_init)
161 retl
162 nop
163 .size niagara_patch_bzero,.-niagara_patch_bzero
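NGmemset above widens the fill byte into a full 64-bit store pattern with a short and/sllx/or sequence before it falls into the doubleword and block-store loops. A minimal C sketch of that replication step (the helper name is illustrative, not kernel code):

```c
#include <stdint.h>

/* Replicate the low byte of `pat` into all eight bytes of a 64-bit
 * word, mirroring the and/sllx/or sequence at the top of NGmemset. */
static uint64_t replicate_fill_byte(uint64_t pat)
{
	uint64_t v = pat & 0xff;   /* and  %o1, 0xff, %o3      */
	v |= v << 8;               /* 16-bit pattern           */
	v |= v << 16;              /* 32-bit pattern           */
	v |= v << 32;              /* full 64-bit fill pattern */
	return v;
}
```

Every stxa in the 64-byte loop then stores this doubleword pattern.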
diff --git a/arch/sparc64/lib/NGcopy_from_user.S b/arch/sparc64/lib/NGcopy_from_user.S
new file mode 100644
index 000000000000..2d93456f76dd
--- /dev/null
+++ b/arch/sparc64/lib/NGcopy_from_user.S
@@ -0,0 +1,37 @@
1/* NGcopy_from_user.S: Niagara optimized copy from userspace.
2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
4 */
5
6#define EX_LD(x) \
798: x; \
8 .section .fixup; \
9 .align 4; \
1099: wr %g0, ASI_AIUS, %asi;\
11 retl; \
12 mov 1, %o0; \
13 .section __ex_table,"a";\
14 .align 4; \
15 .word 98b, 99b; \
16 .text; \
17 .align 4;
18
19#ifndef ASI_AIUS
20#define ASI_AIUS 0x11
21#endif
22
23#define FUNC_NAME NGcopy_from_user
24#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest
25#define LOAD_TWIN(addr_reg,dest0,dest1) \
26 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0
27#define EX_RETVAL(x) 0
28
29#ifdef __KERNEL__
30#define PREAMBLE \
31 rd %asi, %g1; \
32 cmp %g1, ASI_AIUS; \
33 bne,pn %icc, memcpy_user_stub; \
34 nop
35#endif
36
37#include "NGmemcpy.S"
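The EX_LD wrapper above registers every user-space load with the exception table: the `.word 98b, 99b` pair records the address of the instruction that may fault and the fixup that restores %asi and returns an error. Conceptually each entry has the shape below (a sketch only; the struct name is illustrative, the real table is emitted by the assembler directives):

```c
/* One conceptual __ex_table entry as emitted by EX_LD/EX_ST: if a
 * fault is taken at `insn`, the trap handler resumes at `fixup`. */
struct ex_table_entry_sketch {
	unsigned int insn;	/* .word 98b - the faulting load/store */
	unsigned int fixup;	/* .word 99b - the recovery code       */
};
```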
diff --git a/arch/sparc64/lib/NGcopy_to_user.S b/arch/sparc64/lib/NGcopy_to_user.S
new file mode 100644
index 000000000000..34112d5054ef
--- /dev/null
+++ b/arch/sparc64/lib/NGcopy_to_user.S
@@ -0,0 +1,40 @@
1/* NGcopy_to_user.S: Niagara optimized copy to userspace.
2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
4 */
5
6#define EX_ST(x) \
798: x; \
8 .section .fixup; \
9 .align 4; \
1099: wr %g0, ASI_AIUS, %asi;\
11 retl; \
12 mov 1, %o0; \
13 .section __ex_table,"a";\
14 .align 4; \
15 .word 98b, 99b; \
16 .text; \
17 .align 4;
18
19#ifndef ASI_AIUS
20#define ASI_AIUS 0x11
21#endif
22
23#define FUNC_NAME NGcopy_to_user
24#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
25#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
26#define EX_RETVAL(x) 0
27
28#ifdef __KERNEL__
29 /* Writing to %asi is _expensive_ so we hardcode it.
30 * Reading %asi to check for KERNEL_DS is comparatively
31 * cheap.
32 */
33#define PREAMBLE \
34 rd %asi, %g1; \
35 cmp %g1, ASI_AIUS; \
36 bne,pn %icc, memcpy_user_stub; \
37 nop
38#endif
39
40#include "NGmemcpy.S"
diff --git a/arch/sparc64/lib/NGmemcpy.S b/arch/sparc64/lib/NGmemcpy.S
new file mode 100644
index 000000000000..8e522b3dc095
--- /dev/null
+++ b/arch/sparc64/lib/NGmemcpy.S
@@ -0,0 +1,368 @@
1/* NGmemcpy.S: Niagara optimized memcpy.
2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net)
4 */
5
6#ifdef __KERNEL__
7#include <asm/asi.h>
8#include <asm/thread_info.h>
9#define GLOBAL_SPARE %g7
10#define RESTORE_ASI(TMP) \
11 ldub [%g6 + TI_CURRENT_DS], TMP; \
12 wr TMP, 0x0, %asi;
13#else
14#define GLOBAL_SPARE %g5
15#define RESTORE_ASI(TMP) \
16 wr %g0, ASI_PNF, %asi
17#endif
18
19#ifndef STORE_ASI
20#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
21#endif
22
23#ifndef EX_LD
24#define EX_LD(x) x
25#endif
26
27#ifndef EX_ST
28#define EX_ST(x) x
29#endif
30
31#ifndef EX_RETVAL
32#define EX_RETVAL(x) x
33#endif
34
35#ifndef LOAD
36#ifndef MEMCPY_DEBUG
37#define LOAD(type,addr,dest) type [addr], dest
38#else
39#define LOAD(type,addr,dest) type##a [addr] 0x80, dest
40#endif
41#endif
42
43#ifndef LOAD_TWIN
44#define LOAD_TWIN(addr_reg,dest0,dest1) \
45 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
46#endif
47
48#ifndef STORE
49#define STORE(type,src,addr) type src, [addr]
50#endif
51
52#ifndef STORE_INIT
53#define STORE_INIT(src,addr) stxa src, [addr] %asi
54#endif
55
56#ifndef FUNC_NAME
57#define FUNC_NAME NGmemcpy
58#endif
59
60#ifndef PREAMBLE
61#define PREAMBLE
62#endif
63
64#ifndef XCC
65#define XCC xcc
66#endif
67
68 .register %g2,#scratch
69 .register %g3,#scratch
70
71 .text
72 .align 64
73
74 .globl FUNC_NAME
75 .type FUNC_NAME,#function
76FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
77 srlx %o2, 31, %g2
78 cmp %g2, 0
79 tne %xcc, 5
80 PREAMBLE
81 mov %o0, GLOBAL_SPARE
82 cmp %o2, 0
83 be,pn %XCC, 85f
84 or %o0, %o1, %o3
85 cmp %o2, 16
86 blu,a,pn %XCC, 80f
87 or %o3, %o2, %o3
88
89 /* 2 blocks (128 bytes) is the minimum we can do the block
90 * copy with. We need to ensure that we'll iterate at least
91 * once in the block copy loop. At worst we'll need to align
92 * the destination to a 64-byte boundary which can chew up
93 * to (64 - 1) bytes from the length before we perform the
94 * block copy loop.
95 */
96 cmp %o2, (2 * 64)
97 blu,pt %XCC, 70f
98 andcc %o3, 0x7, %g0
99
100 /* %o0: dst
101 * %o1: src
102 * %o2: len (known to be >= 128)
103 *
104 * The block copy loops will use %o4/%o5,%g2/%g3 as
105 * temporaries while copying the data.
106 */
107
108 LOAD(prefetch, %o1, #one_read)
109 wr %g0, STORE_ASI, %asi
110
111 /* Align destination on 64-byte boundary. */
112 andcc %o0, (64 - 1), %o4
113 be,pt %XCC, 2f
114 sub %o4, 64, %o4
115 sub %g0, %o4, %o4 ! bytes to align dst
116 sub %o2, %o4, %o2
1171: subcc %o4, 1, %o4
118 EX_LD(LOAD(ldub, %o1, %g1))
119 EX_ST(STORE(stb, %g1, %o0))
120 add %o1, 1, %o1
121 bne,pt %XCC, 1b
122 add %o0, 1, %o0
123
124 /* If the source is on a 16-byte boundary we can do
125 * the direct block copy loop. If it is 8-byte aligned
126 * we can do the 16-byte loads offset by -8 bytes and the
127 * init stores offset by one register.
128 *
129 * If the source is not even 8-byte aligned, we need to do
130 * shifting and masking (basically integer faligndata).
131 *
132 * The careful bit with init stores is that if we store
133 * to any part of the cache line we have to store the whole
134 * cacheline else we can end up with corrupt L2 cache line
135 * contents. Since the loop works on 64-bytes of 64-byte
136 * aligned store data at a time, this is easy to ensure.
137 */
1382:
139 andcc %o1, (16 - 1), %o4
140 andn %o2, (64 - 1), %g1 ! block copy loop iterator
141 sub %o2, %g1, %o2 ! final sub-block copy bytes
142 be,pt %XCC, 50f
143 cmp %o4, 8
144 be,a,pt %XCC, 10f
145 sub %o1, 0x8, %o1
146
147 /* Neither 8-byte nor 16-byte aligned, shift and mask. */
148 mov %g1, %o4
149 and %o1, 0x7, %g1
150 sll %g1, 3, %g1
151 mov 64, %o3
152 andn %o1, 0x7, %o1
153 EX_LD(LOAD(ldx, %o1, %g2))
154 sub %o3, %g1, %o3
155 sllx %g2, %g1, %g2
156
157#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\
158 EX_LD(LOAD(ldx, SRC, TMP1)); \
159 srlx TMP1, PRE_SHIFT, TMP2; \
160 or TMP2, PRE_VAL, TMP2; \
161 EX_ST(STORE_INIT(TMP2, DST)); \
162 sllx TMP1, POST_SHIFT, PRE_VAL;
163
1641: add %o1, 0x8, %o1
165 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00)
166 add %o1, 0x8, %o1
167 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08)
168 add %o1, 0x8, %o1
169 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10)
170 add %o1, 0x8, %o1
171 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18)
172 add %o1, 32, %o1
173 LOAD(prefetch, %o1, #one_read)
174 sub %o1, 32 - 8, %o1
175 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20)
176 add %o1, 8, %o1
177 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28)
178 add %o1, 8, %o1
179 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30)
180 add %o1, 8, %o1
181 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38)
182 subcc %o4, 64, %o4
183 bne,pt %XCC, 1b
184 add %o0, 64, %o0
185
186#undef SWIVEL_ONE_DWORD
187
188 srl %g1, 3, %g1
189 ba,pt %XCC, 60f
190 add %o1, %g1, %o1
191
19210: /* Destination is 64-byte aligned, source was only 8-byte
193 * aligned but it has been subtracted by 8 and we perform
194 * one twin load ahead, then add 8 back into source when
195 * we finish the loop.
196 */
197 EX_LD(LOAD_TWIN(%o1, %o4, %o5))
1981: add %o1, 16, %o1
199 EX_LD(LOAD_TWIN(%o1, %g2, %g3))
200 add %o1, 16 + 32, %o1
201 LOAD(prefetch, %o1, #one_read)
202 sub %o1, 32, %o1
203 EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! initializes cache line
204 EX_ST(STORE_INIT(%g2, %o0 + 0x08))
205 EX_LD(LOAD_TWIN(%o1, %o4, %o5))
206 add %o1, 16, %o1
207 EX_ST(STORE_INIT(%g3, %o0 + 0x10))
208 EX_ST(STORE_INIT(%o4, %o0 + 0x18))
209 EX_LD(LOAD_TWIN(%o1, %g2, %g3))
210 add %o1, 16, %o1
211 EX_ST(STORE_INIT(%o5, %o0 + 0x20))
212 EX_ST(STORE_INIT(%g2, %o0 + 0x28))
213 EX_LD(LOAD_TWIN(%o1, %o4, %o5))
214 EX_ST(STORE_INIT(%g3, %o0 + 0x30))
215 EX_ST(STORE_INIT(%o4, %o0 + 0x38))
216 subcc %g1, 64, %g1
217 bne,pt %XCC, 1b
218 add %o0, 64, %o0
219
220 ba,pt %XCC, 60f
221 add %o1, 0x8, %o1
222
22350: /* Destination is 64-byte aligned, and source is 16-byte
224 * aligned.
225 */
2261: EX_LD(LOAD_TWIN(%o1, %o4, %o5))
227 add %o1, 16, %o1
228 EX_LD(LOAD_TWIN(%o1, %g2, %g3))
229 add %o1, 16 + 32, %o1
230 LOAD(prefetch, %o1, #one_read)
231 sub %o1, 32, %o1
232 EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line
233 EX_ST(STORE_INIT(%o5, %o0 + 0x08))
234 EX_LD(LOAD_TWIN(%o1, %o4, %o5))
235 add %o1, 16, %o1
236 EX_ST(STORE_INIT(%g2, %o0 + 0x10))
237 EX_ST(STORE_INIT(%g3, %o0 + 0x18))
238 EX_LD(LOAD_TWIN(%o1, %g2, %g3))
239 add %o1, 16, %o1
240 EX_ST(STORE_INIT(%o4, %o0 + 0x20))
241 EX_ST(STORE_INIT(%o5, %o0 + 0x28))
242 EX_ST(STORE_INIT(%g2, %o0 + 0x30))
243 EX_ST(STORE_INIT(%g3, %o0 + 0x38))
244 subcc %g1, 64, %g1
245 bne,pt %XCC, 1b
246 add %o0, 64, %o0
247 /* fall through */
248
24960:
250 /* %o2 contains any final bytes still needed to be copied
251 * over. If anything is left, we copy it one byte at a time.
252 */
253 RESTORE_ASI(%o3)
254 brz,pt %o2, 85f
255 sub %o0, %o1, %o3
256 ba,a,pt %XCC, 90f
257
258 .align 64
25970: /* 16 < len <= 64 */
260 bne,pn %XCC, 75f
261 sub %o0, %o1, %o3
262
26372:
264 andn %o2, 0xf, %o4
265 and %o2, 0xf, %o2
2661: subcc %o4, 0x10, %o4
267 EX_LD(LOAD(ldx, %o1, %o5))
268 add %o1, 0x08, %o1
269 EX_LD(LOAD(ldx, %o1, %g1))
270 sub %o1, 0x08, %o1
271 EX_ST(STORE(stx, %o5, %o1 + %o3))
272 add %o1, 0x8, %o1
273 EX_ST(STORE(stx, %g1, %o1 + %o3))
274 bgu,pt %XCC, 1b
275 add %o1, 0x8, %o1
27673: andcc %o2, 0x8, %g0
277 be,pt %XCC, 1f
278 nop
279 sub %o2, 0x8, %o2
280 EX_LD(LOAD(ldx, %o1, %o5))
281 EX_ST(STORE(stx, %o5, %o1 + %o3))
282 add %o1, 0x8, %o1
2831: andcc %o2, 0x4, %g0
284 be,pt %XCC, 1f
285 nop
286 sub %o2, 0x4, %o2
287 EX_LD(LOAD(lduw, %o1, %o5))
288 EX_ST(STORE(stw, %o5, %o1 + %o3))
289 add %o1, 0x4, %o1
2901: cmp %o2, 0
291 be,pt %XCC, 85f
292 nop
293 ba,pt %xcc, 90f
294 nop
295
29675:
297 andcc %o0, 0x7, %g1
298 sub %g1, 0x8, %g1
299 be,pn %icc, 2f
300 sub %g0, %g1, %g1
301 sub %o2, %g1, %o2
302
3031: subcc %g1, 1, %g1
304 EX_LD(LOAD(ldub, %o1, %o5))
305 EX_ST(STORE(stb, %o5, %o1 + %o3))
306 bgu,pt %icc, 1b
307 add %o1, 1, %o1
308
3092: add %o1, %o3, %o0
310 andcc %o1, 0x7, %g1
311 bne,pt %icc, 8f
312 sll %g1, 3, %g1
313
314 cmp %o2, 16
315 bgeu,pt %icc, 72b
316 nop
317 ba,a,pt %xcc, 73b
318
3198: mov 64, %o3
320 andn %o1, 0x7, %o1
321 EX_LD(LOAD(ldx, %o1, %g2))
322 sub %o3, %g1, %o3
323 andn %o2, 0x7, %o4
324 sllx %g2, %g1, %g2
3251: add %o1, 0x8, %o1
326 EX_LD(LOAD(ldx, %o1, %g3))
327 subcc %o4, 0x8, %o4
328 srlx %g3, %o3, %o5
329 or %o5, %g2, %o5
330 EX_ST(STORE(stx, %o5, %o0))
331 add %o0, 0x8, %o0
332 bgu,pt %icc, 1b
333 sllx %g3, %g1, %g2
334
335 srl %g1, 3, %g1
336 andcc %o2, 0x7, %o2
337 be,pn %icc, 85f
338 add %o1, %g1, %o1
339 ba,pt %xcc, 90f
340 sub %o0, %o1, %o3
341
342 .align 64
34380: /* 0 < len <= 16 */
344 andcc %o3, 0x3, %g0
345 bne,pn %XCC, 90f
346 sub %o0, %o1, %o3
347
3481:
349 subcc %o2, 4, %o2
350 EX_LD(LOAD(lduw, %o1, %g1))
351 EX_ST(STORE(stw, %g1, %o1 + %o3))
352 bgu,pt %XCC, 1b
353 add %o1, 4, %o1
354
35585: retl
356 mov EX_RETVAL(GLOBAL_SPARE), %o0
357
358 .align 32
35990:
360 subcc %o2, 1, %o2
361 EX_LD(LOAD(ldub, %o1, %g1))
362 EX_ST(STORE(stb, %g1, %o1 + %o3))
363 bgu,pt %XCC, 90b
364 add %o1, 1, %o1
365 retl
366 mov EX_RETVAL(GLOBAL_SPARE), %o0
367
368 .size FUNC_NAME, .-FUNC_NAME
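When the source is not even 8-byte aligned, NGmemcpy assembles each output doubleword from two adjacent aligned loads via SWIVEL_ONE_DWORD, the "integer faligndata" mentioned in the comment. A rough C model of that inner step, purely illustrative; it assumes big-endian byte order and a non-zero misalignment, which is exactly when this path is taken:

```c
#include <stdint.h>
#include <stddef.h>

/* Shift-and-mask copy: `src` is misaligned by 1..7 bytes, `dst` is
 * aligned.  Reads one aligned doubleword ahead, as the assembly does. */
static void unaligned_copy64(uint64_t *dst, const uint8_t *src, size_t dwords)
{
	unsigned int shift = ((uintptr_t)src & 7) * 8;   /* %g1 = misalign * 8 */
	const uint64_t *s = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
	uint64_t prev = s[0] << shift;                   /* pre-shifted head   */

	for (size_t i = 0; i < dwords; i++) {
		uint64_t next = s[i + 1];                /* next aligned ldx   */
		dst[i] = prev | (next >> (64 - shift));  /* assemble and store */
		prev = next << shift;                    /* carry the rest     */
	}
}
```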
diff --git a/arch/sparc64/lib/NGpage.S b/arch/sparc64/lib/NGpage.S
new file mode 100644
index 000000000000..7d7c3bb8dcbf
--- /dev/null
+++ b/arch/sparc64/lib/NGpage.S
@@ -0,0 +1,96 @@
1/* NGpage.S: Niagara optimized clear and copy page.
2 *
3 * Copyright (C) 2006 (davem@davemloft.net)
4 */
5
6#include <asm/asi.h>
7#include <asm/page.h>
8
9 .text
10 .align 32
11
12 /* This is heavily simplified from the sun4u variants
13 * because Niagara does not have any D-cache aliasing issues
14 * and also we don't need to use the FPU in order to implement
15 * an optimal page copy/clear.
16 */
17
18NGcopy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */
19 prefetch [%o1 + 0x00], #one_read
20 mov 8, %g1
21 mov 16, %g2
22 mov 24, %g3
23 set PAGE_SIZE, %g7
24
251: ldda [%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
26 ldda [%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
27 prefetch [%o1 + 0x40], #one_read
28 add %o1, 32, %o1
29 stxa %o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
30 stxa %o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
31 ldda [%o1 + %g0] ASI_BLK_INIT_QUAD_LDD_P, %o2
32 stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
33 stxa %o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
34 ldda [%o1 + %g2] ASI_BLK_INIT_QUAD_LDD_P, %o4
35 add %o1, 32, %o1
36 add %o0, 32, %o0
37 stxa %o2, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
38 stxa %o3, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
39 stxa %o4, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
40 stxa %o5, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
41 subcc %g7, 64, %g7
42 bne,pt %xcc, 1b
43 add %o0, 32, %o0
44 retl
45 nop
46
47NGclear_page: /* %o0=dest */
48NGclear_user_page: /* %o0=dest, %o1=vaddr */
49 mov 8, %g1
50 mov 16, %g2
51 mov 24, %g3
52 set PAGE_SIZE, %g7
53
541: stxa %g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
55 stxa %g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
56 stxa %g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
57 stxa %g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
58 add %o0, 32, %o0
59 stxa %g0, [%o0 + %g0] ASI_BLK_INIT_QUAD_LDD_P
60 stxa %g0, [%o0 + %g1] ASI_BLK_INIT_QUAD_LDD_P
61 stxa %g0, [%o0 + %g2] ASI_BLK_INIT_QUAD_LDD_P
62 stxa %g0, [%o0 + %g3] ASI_BLK_INIT_QUAD_LDD_P
63 subcc %g7, 64, %g7
64 bne,pt %xcc, 1b
65 add %o0, 32, %o0
66 retl
67 nop
68
69#define BRANCH_ALWAYS 0x10680000
70#define NOP 0x01000000
71#define NG_DO_PATCH(OLD, NEW) \
72 sethi %hi(NEW), %g1; \
73 or %g1, %lo(NEW), %g1; \
74 sethi %hi(OLD), %g2; \
75 or %g2, %lo(OLD), %g2; \
76 sub %g1, %g2, %g1; \
77 sethi %hi(BRANCH_ALWAYS), %g3; \
78 sll %g1, 11, %g1; \
79 srl %g1, 11 + 2, %g1; \
80 or %g3, %lo(BRANCH_ALWAYS), %g3; \
81 or %g3, %g1, %g3; \
82 stw %g3, [%g2]; \
83 sethi %hi(NOP), %g3; \
84 or %g3, %lo(NOP), %g3; \
85 stw %g3, [%g2 + 0x4]; \
86 flush %g2;
87
88 .globl niagara_patch_pageops
89 .type niagara_patch_pageops,#function
90niagara_patch_pageops:
91 NG_DO_PATCH(copy_user_page, NGcopy_user_page)
92 NG_DO_PATCH(_clear_page, NGclear_page)
93 NG_DO_PATCH(clear_user_page, NGclear_user_page)
94 retl
95 nop
96 .size niagara_patch_pageops,.-niagara_patch_pageops
diff --git a/arch/sparc64/lib/NGpatch.S b/arch/sparc64/lib/NGpatch.S
new file mode 100644
index 000000000000..3b0674fc3366
--- /dev/null
+++ b/arch/sparc64/lib/NGpatch.S
@@ -0,0 +1,33 @@
1/* NGpatch.S: Patch Ultra-I routines with Niagara variant.
2 *
3 * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
4 */
5
6#define BRANCH_ALWAYS 0x10680000
7#define NOP 0x01000000
8#define NG_DO_PATCH(OLD, NEW) \
9 sethi %hi(NEW), %g1; \
10 or %g1, %lo(NEW), %g1; \
11 sethi %hi(OLD), %g2; \
12 or %g2, %lo(OLD), %g2; \
13 sub %g1, %g2, %g1; \
14 sethi %hi(BRANCH_ALWAYS), %g3; \
15 sll %g1, 11, %g1; \
16 srl %g1, 11 + 2, %g1; \
17 or %g3, %lo(BRANCH_ALWAYS), %g3; \
18 or %g3, %g1, %g3; \
19 stw %g3, [%g2]; \
20 sethi %hi(NOP), %g3; \
21 or %g3, %lo(NOP), %g3; \
22 stw %g3, [%g2 + 0x4]; \
23 flush %g2;
24
25 .globl niagara_patch_copyops
26 .type niagara_patch_copyops,#function
27niagara_patch_copyops:
28 NG_DO_PATCH(memcpy, NGmemcpy)
29 NG_DO_PATCH(___copy_from_user, NGcopy_from_user)
30 NG_DO_PATCH(___copy_to_user, NGcopy_to_user)
31 retl
32 nop
33 .size niagara_patch_copyops,.-niagara_patch_copyops
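NG_DO_PATCH redirects a generic routine by overwriting its first two instructions with a branch-always to the Niagara version and a nop in the delay slot; the sll/srl pair truncates the byte displacement into the branch's 19-bit word-displacement field. The same arithmetic as a hedged C sketch (function and variable names are illustrative):

```c
#include <stdint.h>

#define BRANCH_ALWAYS	0x10680000u	/* ba,pt %xcc opcode, disp19 = 0 */
#define NOP		0x01000000u

/* Rewrite the first two instructions of `old_fn` to jump to `new_fn`,
 * mirroring the sethi/or/sub/sll/srl/or/stw sequence in NG_DO_PATCH. */
static void patch_branch(uint32_t *old_fn, uint32_t *new_fn)
{
	uint32_t disp   = (uint32_t)((uintptr_t)new_fn - (uintptr_t)old_fn);
	uint32_t disp19 = (disp << 11) >> 13;	/* sll 11; srl 11 + 2 */

	old_fn[0] = BRANCH_ALWAYS | disp19;	/* branch to the NG routine */
	old_fn[1] = NOP;			/* delay slot               */
	/* the assembly then issues "flush" to synchronize the I-cache */
}
```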
diff --git a/arch/sparc64/lib/U3patch.S b/arch/sparc64/lib/U3patch.S
index e2b6c5e4b95a..ecc302619a6e 100644
--- a/arch/sparc64/lib/U3patch.S
+++ b/arch/sparc64/lib/U3patch.S
@@ -12,7 +12,8 @@
 	or	%g2, %lo(OLD), %g2; \
 	sub	%g1, %g2, %g1; \
 	sethi	%hi(BRANCH_ALWAYS), %g3; \
-	srl	%g1, 2, %g1; \
+	sll	%g1, 11, %g1; \
+	srl	%g1, 11 + 2, %g1; \
 	or	%g3, %lo(BRANCH_ALWAYS), %g3; \
 	or	%g3, %g1, %g3; \
 	stw	%g3, [%g2]; \
diff --git a/arch/sparc64/lib/bzero.S b/arch/sparc64/lib/bzero.S
index 1d2abcfa4e52..c7bbae8c590f 100644
--- a/arch/sparc64/lib/bzero.S
+++ b/arch/sparc64/lib/bzero.S
@@ -98,12 +98,12 @@ __bzero_done:
 	.text;			\
 	.align 4;
 
-	.globl	__bzero_noasi
-	.type	__bzero_noasi, #function
-__bzero_noasi:	/* %o0=buf, %o1=len */
-	brz,pn	%o1, __bzero_noasi_done
+	.globl	__clear_user
+	.type	__clear_user, #function
+__clear_user:	/* %o0=buf, %o1=len */
+	brz,pn	%o1, __clear_user_done
 	 cmp	%o1, 16
-	bl,pn	%icc, __bzero_noasi_tiny
+	bl,pn	%icc, __clear_user_tiny
 	 EX_ST(prefetcha [%o0 + 0x00] %asi, #n_writes)
 	andcc	%o0, 0x3, %g0
 	be,pt	%icc, 2f
@@ -145,14 +145,14 @@ __bzero_noasi: /* %o0=buf, %o1=len */
 	subcc	%g1, 8, %g1
 	bne,pt	%icc, 5b
 	 add	%o0, 0x8, %o0
-6:	brz,pt	%o1, __bzero_noasi_done
+6:	brz,pt	%o1, __clear_user_done
 	 nop
-__bzero_noasi_tiny:
+__clear_user_tiny:
 1:	EX_ST(stba	%g0, [%o0 + 0x00] %asi)
 	subcc	%o1, 1, %o1
 	bne,pt	%icc, 1b
 	 add	%o0, 1, %o0
-__bzero_noasi_done:
+__clear_user_done:
 	retl
 	 clr	%o0
-	.size	__bzero_noasi, .-__bzero_noasi
+	.size	__clear_user, .-__clear_user
diff --git a/arch/sparc64/lib/clear_page.S b/arch/sparc64/lib/clear_page.S
index b59884ef051d..77e531f6c2a7 100644
--- a/arch/sparc64/lib/clear_page.S
+++ b/arch/sparc64/lib/clear_page.S
@@ -9,6 +9,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/spitfire.h>
+#include <asm/head.h>
 
 /* What we used to do was lock a TLB entry into a specific
  * TLB slot, clear the page with interrupts disabled, then
@@ -22,9 +23,6 @@
  * disable preemption during the clear.
  */
 
-#define TTE_BITS_TOP	(_PAGE_VALID | _PAGE_SZBITS)
-#define TTE_BITS_BOTTOM	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_L | _PAGE_W)
-
 	.text
 
 	.globl		_clear_page
@@ -43,12 +41,11 @@ clear_user_page: /* %o0=dest, %o1=vaddr */
 	sethi		%hi(PAGE_SIZE), %o4
 
 	sllx		%g2, 32, %g2
-	sethi		%uhi(TTE_BITS_TOP), %g3
+	sethi		%hi(PAGE_KERNEL_LOCKED), %g3
 
-	sllx		%g3, 32, %g3
+	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
 	sub		%o0, %g2, %g1		! paddr
 
-	or		%g3, TTE_BITS_BOTTOM, %g3
 	and		%o1, %o4, %o0		! vaddr D-cache alias bit
 
 	or		%g1, %g3, %g1		! TTE data
@@ -66,7 +63,8 @@ clear_user_page: /* %o0=dest, %o1=vaddr */
 	wrpr		%o4, PSTATE_IE, %pstate
 	stxa		%o0, [%g3] ASI_DMMU
 	stxa		%g1, [%g0] ASI_DTLB_DATA_IN
-	flush		%g6
+	sethi		%hi(KERNBASE), %g1
+	flush		%g1
 	wrpr		%o4, 0x0, %pstate
 
 	mov		1, %o4
diff --git a/arch/sparc64/lib/copy_page.S b/arch/sparc64/lib/copy_page.S
index feebb14fd27a..37460666a5c3 100644
--- a/arch/sparc64/lib/copy_page.S
+++ b/arch/sparc64/lib/copy_page.S
@@ -23,8 +23,6 @@
  * disable preemption during the clear.
  */
 
-#define TTE_BITS_TOP	(_PAGE_VALID | _PAGE_SZBITS)
-#define TTE_BITS_BOTTOM	(_PAGE_CP | _PAGE_CV | _PAGE_P | _PAGE_L | _PAGE_W)
 #define	DCACHE_SIZE	(PAGE_SIZE * 2)
 
 #if (PAGE_SHIFT == 13) || (PAGE_SHIFT == 19)
@@ -52,13 +50,12 @@ copy_user_page: /* %o0=dest, %o1=src, %o2=vaddr */
 	sethi		%hi(PAGE_SIZE), %o3
 
 	sllx		%g2, 32, %g2
-	sethi		%uhi(TTE_BITS_TOP), %g3
+	sethi		%hi(PAGE_KERNEL_LOCKED), %g3
 
-	sllx		%g3, 32, %g3
+	ldx		[%g3 + %lo(PAGE_KERNEL_LOCKED)], %g3
 	sub		%o0, %g2, %g1		! dest paddr
 
 	sub		%o1, %g2, %g2		! src paddr
-	or		%g3, TTE_BITS_BOTTOM, %g3
 
 	and		%o2, %o3, %o0		! vaddr D-cache alias bit
 	or		%g1, %g3, %g1		! dest TTE data
diff --git a/arch/sparc64/lib/delay.c b/arch/sparc64/lib/delay.c
index e8808727617a..fb27e54a03ee 100644
--- a/arch/sparc64/lib/delay.c
+++ b/arch/sparc64/lib/delay.c
@@ -1,6 +1,6 @@
 /* delay.c: Delay loops for sparc64
  *
- * Copyright (C) 2004 David S. Miller <davem@redhat.com>
+ * Copyright (C) 2004, 2006 David S. Miller <davem@davemloft.net>
  *
  * Based heavily upon x86 variant which is:
  *	Copyright (C) 1993 Linus Torvalds
@@ -8,19 +8,16 @@
  */
 
 #include <linux/delay.h>
+#include <asm/timer.h>
 
 void __delay(unsigned long loops)
 {
-	__asm__ __volatile__(
-"	b,pt	%%xcc, 1f\n"
-"	 cmp	%0, 0\n"
-"	.align	32\n"
-"1:\n"
-"	bne,pt	%%xcc, 1b\n"
-"	 subcc	%0, 1, %0\n"
-	: "=&r" (loops)
-	:   "0" (loops)
-	: "cc");
+	unsigned long bclock, now;
+
+	bclock = tick_ops->get_tick();
+	do {
+		now = tick_ops->get_tick();
+	} while ((now-bclock) < loops);
 }
 
 /* We used to multiply by HZ after shifting down by 32 bits
diff --git a/arch/sparc64/lib/xor.S b/arch/sparc64/lib/xor.S
index 4cd5d2be1ae1..a79c8888170d 100644
--- a/arch/sparc64/lib/xor.S
+++ b/arch/sparc64/lib/xor.S
@@ -2,9 +2,10 @@
  * arch/sparc64/lib/xor.S
  *
  * High speed xor_block operation for RAID4/5 utilizing the
- * UltraSparc Visual Instruction Set.
+ * UltraSparc Visual Instruction Set and Niagara store-init/twin-load.
  *
  * Copyright (C) 1997, 1999 Jakub Jelinek (jj@ultra.linux.cz)
+ * Copyright (C) 2006 David S. Miller <davem@davemloft.net>
  */
 
 #include <asm/visasm.h>
@@ -19,6 +20,8 @@
  */
 	.text
 	.align	32
+
+	/* VIS versions. */
 	.globl	xor_vis_2
 	.type	xor_vis_2,#function
 xor_vis_2:
@@ -352,3 +355,298 @@ xor_vis_5:
 	ret
 	 restore
 	.size	xor_vis_5, .-xor_vis_5
358
359 /* Niagara versions. */
360 .globl xor_niagara_2
361 .type xor_niagara_2,#function
362xor_niagara_2: /* %o0=bytes, %o1=dest, %o2=src */
363 save %sp, -192, %sp
364 prefetch [%i1], #n_writes
365 prefetch [%i2], #one_read
366 rd %asi, %g7
367 wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
368 srlx %i0, 6, %g1
369 mov %i1, %i0
370 mov %i2, %i1
3711: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src + 0x00 */
372 ldda [%i1 + 0x10] %asi, %i4 /* %i4/%i5 = src + 0x10 */
373 ldda [%i1 + 0x20] %asi, %g2 /* %g2/%g3 = src + 0x20 */
374 ldda [%i1 + 0x30] %asi, %l0 /* %l0/%l1 = src + 0x30 */
375 prefetch [%i1 + 0x40], #one_read
376 ldda [%i0 + 0x00] %asi, %o0 /* %o0/%o1 = dest + 0x00 */
377 ldda [%i0 + 0x10] %asi, %o2 /* %o2/%o3 = dest + 0x10 */
378 ldda [%i0 + 0x20] %asi, %o4 /* %o4/%o5 = dest + 0x20 */
379 ldda [%i0 + 0x30] %asi, %l2 /* %l2/%l3 = dest + 0x30 */
380 prefetch [%i0 + 0x40], #n_writes
381 xor %o0, %i2, %o0
382 xor %o1, %i3, %o1
383 stxa %o0, [%i0 + 0x00] %asi
384 stxa %o1, [%i0 + 0x08] %asi
385 xor %o2, %i4, %o2
386 xor %o3, %i5, %o3
387 stxa %o2, [%i0 + 0x10] %asi
388 stxa %o3, [%i0 + 0x18] %asi
389 xor %o4, %g2, %o4
390 xor %o5, %g3, %o5
391 stxa %o4, [%i0 + 0x20] %asi
392 stxa %o5, [%i0 + 0x28] %asi
393 xor %l2, %l0, %l2
394 xor %l3, %l1, %l3
395 stxa %l2, [%i0 + 0x30] %asi
396 stxa %l3, [%i0 + 0x38] %asi
397 add %i0, 0x40, %i0
398 subcc %g1, 1, %g1
399 bne,pt %xcc, 1b
400 add %i1, 0x40, %i1
401 membar #Sync
402 wr %g7, 0x0, %asi
403 ret
404 restore
405 .size xor_niagara_2, .-xor_niagara_2
406
407 .globl xor_niagara_3
408 .type xor_niagara_3,#function
409xor_niagara_3: /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2 */
410 save %sp, -192, %sp
411 prefetch [%i1], #n_writes
412 prefetch [%i2], #one_read
413 prefetch [%i3], #one_read
414 rd %asi, %g7
415 wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
416 srlx %i0, 6, %g1
417 mov %i1, %i0
418 mov %i2, %i1
419 mov %i3, %l7
4201: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */
421 ldda [%i1 + 0x10] %asi, %i4 /* %i4/%i5 = src1 + 0x10 */
422 ldda [%l7 + 0x00] %asi, %g2 /* %g2/%g3 = src2 + 0x00 */
423 ldda [%l7 + 0x10] %asi, %l0 /* %l0/%l1 = src2 + 0x10 */
424 ldda [%i0 + 0x00] %asi, %o0 /* %o0/%o1 = dest + 0x00 */
425 ldda [%i0 + 0x10] %asi, %o2 /* %o2/%o3 = dest + 0x10 */
426 xor %g2, %i2, %g2
427 xor %g3, %i3, %g3
428 xor %o0, %g2, %o0
429 xor %o1, %g3, %o1
430 stxa %o0, [%i0 + 0x00] %asi
431 stxa %o1, [%i0 + 0x08] %asi
432 ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */
433 ldda [%l7 + 0x20] %asi, %g2 /* %g2/%g3 = src2 + 0x20 */
434 ldda [%i0 + 0x20] %asi, %o0 /* %o0/%o1 = dest + 0x20 */
435 xor %l0, %i4, %l0
436 xor %l1, %i5, %l1
437 xor %o2, %l0, %o2
438 xor %o3, %l1, %o3
439 stxa %o2, [%i0 + 0x10] %asi
440 stxa %o3, [%i0 + 0x18] %asi
441 ldda [%i1 + 0x30] %asi, %i4 /* %i4/%i5 = src1 + 0x30 */
442 ldda [%l7 + 0x30] %asi, %l0 /* %l0/%l1 = src2 + 0x30 */
443 ldda [%i0 + 0x30] %asi, %o2 /* %o2/%o3 = dest + 0x30 */
444 prefetch [%i1 + 0x40], #one_read
445 prefetch [%l7 + 0x40], #one_read
446 prefetch [%i0 + 0x40], #n_writes
447 xor %g2, %i2, %g2
448 xor %g3, %i3, %g3
449 xor %o0, %g2, %o0
450 xor %o1, %g3, %o1
451 stxa %o0, [%i0 + 0x20] %asi
452 stxa %o1, [%i0 + 0x28] %asi
453 xor %l0, %i4, %l0
454 xor %l1, %i5, %l1
455 xor %o2, %l0, %o2
456 xor %o3, %l1, %o3
457 stxa %o2, [%i0 + 0x30] %asi
458 stxa %o3, [%i0 + 0x38] %asi
459 add %i0, 0x40, %i0
460 add %i1, 0x40, %i1
461 subcc %g1, 1, %g1
462 bne,pt %xcc, 1b
463 add %l7, 0x40, %l7
464 membar #Sync
465 wr %g7, 0x0, %asi
466 ret
467 restore
468 .size xor_niagara_3, .-xor_niagara_3
469
470 .globl xor_niagara_4
471 .type xor_niagara_4,#function
472xor_niagara_4: /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3 */
473 save %sp, -192, %sp
474 prefetch [%i1], #n_writes
475 prefetch [%i2], #one_read
476 prefetch [%i3], #one_read
477 prefetch [%i4], #one_read
478 rd %asi, %g7
479 wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
480 srlx %i0, 6, %g1
481 mov %i1, %i0
482 mov %i2, %i1
483 mov %i3, %l7
484 mov %i4, %l6
4851: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */
486 ldda [%l7 + 0x00] %asi, %i4 /* %i4/%i5 = src2 + 0x00 */
487 ldda [%l6 + 0x00] %asi, %g2 /* %g2/%g3 = src3 + 0x00 */
488 ldda [%i0 + 0x00] %asi, %l0 /* %l0/%l1 = dest + 0x00 */
489 xor %i4, %i2, %i4
490 xor %i5, %i3, %i5
491 ldda [%i1 + 0x10] %asi, %i2 /* %i2/%i3 = src1 + 0x10 */
492 xor %g2, %i4, %g2
493 xor %g3, %i5, %g3
494 ldda [%l7 + 0x10] %asi, %i4 /* %i4/%i5 = src2 + 0x10 */
495 xor %l0, %g2, %l0
496 xor %l1, %g3, %l1
497 stxa %l0, [%i0 + 0x00] %asi
498 stxa %l1, [%i0 + 0x08] %asi
499 ldda [%l6 + 0x10] %asi, %g2 /* %g2/%g3 = src3 + 0x10 */
500 ldda [%i0 + 0x10] %asi, %l0 /* %l0/%l1 = dest + 0x10 */
501
502 xor %i4, %i2, %i4
503 xor %i5, %i3, %i5
504 ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */
505 xor %g2, %i4, %g2
506 xor %g3, %i5, %g3
507 ldda [%l7 + 0x20] %asi, %i4 /* %i4/%i5 = src2 + 0x20 */
508 xor %l0, %g2, %l0
509 xor %l1, %g3, %l1
510 stxa %l0, [%i0 + 0x10] %asi
511 stxa %l1, [%i0 + 0x18] %asi
512 ldda [%l6 + 0x20] %asi, %g2 /* %g2/%g3 = src3 + 0x20 */
513 ldda [%i0 + 0x20] %asi, %l0 /* %l0/%l1 = dest + 0x20 */
514
515 xor %i4, %i2, %i4
516 xor %i5, %i3, %i5
517 ldda [%i1 + 0x30] %asi, %i2 /* %i2/%i3 = src1 + 0x30 */
518 xor %g2, %i4, %g2
519 xor %g3, %i5, %g3
520 ldda [%l7 + 0x30] %asi, %i4 /* %i4/%i5 = src2 + 0x30 */
521 xor %l0, %g2, %l0
522 xor %l1, %g3, %l1
523 stxa %l0, [%i0 + 0x20] %asi
524 stxa %l1, [%i0 + 0x28] %asi
525 ldda [%l6 + 0x30] %asi, %g2 /* %g2/%g3 = src3 + 0x30 */
526 ldda [%i0 + 0x30] %asi, %l0 /* %l0/%l1 = dest + 0x30 */
527
528 prefetch [%i1 + 0x40], #one_read
529 prefetch [%l7 + 0x40], #one_read
530 prefetch [%l6 + 0x40], #one_read
531 prefetch [%i0 + 0x40], #n_writes
532
533 xor %i4, %i2, %i4
534 xor %i5, %i3, %i5
535 xor %g2, %i4, %g2
536 xor %g3, %i5, %g3
537 xor %l0, %g2, %l0
538 xor %l1, %g3, %l1
539 stxa %l0, [%i0 + 0x30] %asi
540 stxa %l1, [%i0 + 0x38] %asi
541
542 add %i0, 0x40, %i0
543 add %i1, 0x40, %i1
544 add %l7, 0x40, %l7
545 subcc %g1, 1, %g1
546 bne,pt %xcc, 1b
547 add %l6, 0x40, %l6
548 membar #Sync
549 wr %g7, 0x0, %asi
550 ret
551 restore
552 .size xor_niagara_4, .-xor_niagara_4
553
554 .globl xor_niagara_5
555 .type xor_niagara_5,#function
556xor_niagara_5: /* %o0=bytes, %o1=dest, %o2=src1, %o3=src2, %o4=src3, %o5=src4 */
557 save %sp, -192, %sp
558 prefetch [%i1], #n_writes
559 prefetch [%i2], #one_read
560 prefetch [%i3], #one_read
561 prefetch [%i4], #one_read
562 prefetch [%i5], #one_read
563 rd %asi, %g7
564 wr %g0, ASI_BLK_INIT_QUAD_LDD_P, %asi
565 srlx %i0, 6, %g1
566 mov %i1, %i0
567 mov %i2, %i1
568 mov %i3, %l7
569 mov %i4, %l6
570 mov %i5, %l5
5711: ldda [%i1 + 0x00] %asi, %i2 /* %i2/%i3 = src1 + 0x00 */
572 ldda [%l7 + 0x00] %asi, %i4 /* %i4/%i5 = src2 + 0x00 */
573 ldda [%l6 + 0x00] %asi, %g2 /* %g2/%g3 = src3 + 0x00 */
574 ldda [%l5 + 0x00] %asi, %l0 /* %l0/%l1 = src4 + 0x00 */
575 ldda [%i0 + 0x00] %asi, %l2 /* %l2/%l3 = dest + 0x00 */
576 xor %i4, %i2, %i4
577 xor %i5, %i3, %i5
578 ldda [%i1 + 0x10] %asi, %i2 /* %i2/%i3 = src1 + 0x10 */
579 xor %g2, %i4, %g2
580 xor %g3, %i5, %g3
581 ldda [%l7 + 0x10] %asi, %i4 /* %i4/%i5 = src2 + 0x10 */
582 xor %l0, %g2, %l0
583 xor %l1, %g3, %l1
584 ldda [%l6 + 0x10] %asi, %g2 /* %g2/%g3 = src3 + 0x10 */
585 xor %l2, %l0, %l2
586 xor %l3, %l1, %l3
587 stxa %l2, [%i0 + 0x00] %asi
588 stxa %l3, [%i0 + 0x08] %asi
589 ldda [%l5 + 0x10] %asi, %l0 /* %l0/%l1 = src4 + 0x10 */
590 ldda [%i0 + 0x10] %asi, %l2 /* %l2/%l3 = dest + 0x10 */
591
592 xor %i4, %i2, %i4
593 xor %i5, %i3, %i5
594 ldda [%i1 + 0x20] %asi, %i2 /* %i2/%i3 = src1 + 0x20 */
595 xor %g2, %i4, %g2
596 xor %g3, %i5, %g3
597 ldda [%l7 + 0x20] %asi, %i4 /* %i4/%i5 = src2 + 0x20 */
598 xor %l0, %g2, %l0
599 xor %l1, %g3, %l1
600 ldda [%l6 + 0x20] %asi, %g2 /* %g2/%g3 = src3 + 0x20 */
601 xor %l2, %l0, %l2
602 xor %l3, %l1, %l3
603 stxa %l2, [%i0 + 0x10] %asi
604 stxa %l3, [%i0 + 0x18] %asi
605 ldda [%l5 + 0x20] %asi, %l0 /* %l0/%l1 = src4 + 0x20 */
606 ldda [%i0 + 0x20] %asi, %l2 /* %l2/%l3 = dest + 0x20 */
607
608 xor %i4, %i2, %i4
609 xor %i5, %i3, %i5
610 ldda [%i1 + 0x30] %asi, %i2 /* %i2/%i3 = src1 + 0x30 */
611 xor %g2, %i4, %g2
612 xor %g3, %i5, %g3
613 ldda [%l7 + 0x30] %asi, %i4 /* %i4/%i5 = src2 + 0x30 */
614 xor %l0, %g2, %l0
615 xor %l1, %g3, %l1
616 ldda [%l6 + 0x30] %asi, %g2 /* %g2/%g3 = src3 + 0x30 */
617 xor %l2, %l0, %l2
618 xor %l3, %l1, %l3
619 stxa %l2, [%i0 + 0x20] %asi
620 stxa %l3, [%i0 + 0x28] %asi
621 ldda [%l5 + 0x30] %asi, %l0 /* %l0/%l1 = src4 + 0x30 */
622 ldda [%i0 + 0x30] %asi, %l2 /* %l2/%l3 = dest + 0x30 */
623
624 prefetch [%i1 + 0x40], #one_read
625 prefetch [%l7 + 0x40], #one_read
626 prefetch [%l6 + 0x40], #one_read
627 prefetch [%l5 + 0x40], #one_read
628 prefetch [%i0 + 0x40], #n_writes
629
630 xor %i4, %i2, %i4
631 xor %i5, %i3, %i5
632 xor %g2, %i4, %g2
633 xor %g3, %i5, %g3
634 xor %l0, %g2, %l0
635 xor %l1, %g3, %l1
636 xor %l2, %l0, %l2
637 xor %l3, %l1, %l3
638 stxa %l2, [%i0 + 0x30] %asi
639 stxa %l3, [%i0 + 0x38] %asi
640
641 add %i0, 0x40, %i0
642 add %i1, 0x40, %i1
643 add %l7, 0x40, %l7
644 add %l6, 0x40, %l6
645 subcc %g1, 1, %g1
646 bne,pt %xcc, 1b
647 add %l5, 0x40, %l5
648 membar #Sync
649 wr %g7, 0x0, %asi
650 ret
651 restore
652 .size xor_niagara_5, .-xor_niagara_5
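Stripped of the block-init ASI, the prefetches, and the software pipelining, xor_niagara_2 computes the plain RAID XOR below; xor_niagara_3/4/5 fold additional source buffers into the same 64-byte pass. A minimal C equivalent for the two-buffer case (illustrative only; `bytes` is assumed to be a multiple of 64, as the assembly requires when it divides the count by 64):

```c
#include <stdint.h>
#include <stddef.h>

/* dest ^= src over `bytes` bytes; the assembly does the same work
 * 64 bytes per loop iteration with twin loads and init stores. */
static void xor_2_sketch(size_t bytes, uint64_t *dest, const uint64_t *src)
{
	size_t n = bytes / sizeof(uint64_t);

	for (size_t i = 0; i < n; i++)
		dest[i] ^= src[i];
}
```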