aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid S. Miller <davem@sunset.davemloft.net>2007-10-02 04:03:09 -0400
committerDavid S. Miller <davem@sunset.davemloft.net>2007-10-02 04:03:09 -0400
commit25e5566ed38650f7990041fcd20571d6ddd2a040 (patch)
tree26d5bf006a6945930102fa01ad0edcd479b39bbf
parent8cc8c28a9acdceda0e60519167a052cc3408c5c3 (diff)
[SPARC64]: Fix missing load-twin usage in Niagara-1 memcpy.
For the case where the source is not aligned modulo 8 we don't use load-twins to suck the data in and this kills performance since normal loads allocate in the L1 cache (unlike load-twin) and thus big memcpys swipe the entire L1 D-cache. We need to allocate a register window to implement this properly, but that actually simplifies a lot of things as a nice side-effect. Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--arch/sparc64/lib/NGcopy_from_user.S8
-rw-r--r--arch/sparc64/lib/NGcopy_to_user.S8
-rw-r--r--arch/sparc64/lib/NGmemcpy.S371
3 files changed, 221 insertions, 166 deletions
diff --git a/arch/sparc64/lib/NGcopy_from_user.S b/arch/sparc64/lib/NGcopy_from_user.S
index 2d93456f76dd..e7f433f71b42 100644
--- a/arch/sparc64/lib/NGcopy_from_user.S
+++ b/arch/sparc64/lib/NGcopy_from_user.S
@@ -1,6 +1,6 @@
1/* NGcopy_from_user.S: Niagara optimized copy from userspace. 1/* NGcopy_from_user.S: Niagara optimized copy from userspace.
2 * 2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net) 3 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
4 */ 4 */
5 5
6#define EX_LD(x) \ 6#define EX_LD(x) \
@@ -8,8 +8,8 @@
8 .section .fixup; \ 8 .section .fixup; \
9 .align 4; \ 9 .align 4; \
1099: wr %g0, ASI_AIUS, %asi;\ 1099: wr %g0, ASI_AIUS, %asi;\
11 retl; \ 11 ret; \
12 mov 1, %o0; \ 12 restore %g0, 1, %o0; \
13 .section __ex_table,"a";\ 13 .section __ex_table,"a";\
14 .align 4; \ 14 .align 4; \
15 .word 98b, 99b; \ 15 .word 98b, 99b; \
@@ -24,7 +24,7 @@
24#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest 24#define LOAD(type,addr,dest) type##a [addr] ASI_AIUS, dest
25#define LOAD_TWIN(addr_reg,dest0,dest1) \ 25#define LOAD_TWIN(addr_reg,dest0,dest1) \
26 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0 26 ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_AIUS, dest0
27#define EX_RETVAL(x) 0 27#define EX_RETVAL(x) %g0
28 28
29#ifdef __KERNEL__ 29#ifdef __KERNEL__
30#define PREAMBLE \ 30#define PREAMBLE \
diff --git a/arch/sparc64/lib/NGcopy_to_user.S b/arch/sparc64/lib/NGcopy_to_user.S
index 34112d5054ef..6ea01c5532a0 100644
--- a/arch/sparc64/lib/NGcopy_to_user.S
+++ b/arch/sparc64/lib/NGcopy_to_user.S
@@ -1,6 +1,6 @@
1/* NGcopy_to_user.S: Niagara optimized copy to userspace. 1/* NGcopy_to_user.S: Niagara optimized copy to userspace.
2 * 2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net) 3 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
4 */ 4 */
5 5
6#define EX_ST(x) \ 6#define EX_ST(x) \
@@ -8,8 +8,8 @@
8 .section .fixup; \ 8 .section .fixup; \
9 .align 4; \ 9 .align 4; \
1099: wr %g0, ASI_AIUS, %asi;\ 1099: wr %g0, ASI_AIUS, %asi;\
11 retl; \ 11 ret; \
12 mov 1, %o0; \ 12 restore %g0, 1, %o0; \
13 .section __ex_table,"a";\ 13 .section __ex_table,"a";\
14 .align 4; \ 14 .align 4; \
15 .word 98b, 99b; \ 15 .word 98b, 99b; \
@@ -23,7 +23,7 @@
23#define FUNC_NAME NGcopy_to_user 23#define FUNC_NAME NGcopy_to_user
24#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS 24#define STORE(type,src,addr) type##a src, [addr] ASI_AIUS
25#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS 25#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_AIUS
26#define EX_RETVAL(x) 0 26#define EX_RETVAL(x) %g0
27 27
28#ifdef __KERNEL__ 28#ifdef __KERNEL__
29 /* Writing to %asi is _expensive_ so we hardcode it. 29 /* Writing to %asi is _expensive_ so we hardcode it.
diff --git a/arch/sparc64/lib/NGmemcpy.S b/arch/sparc64/lib/NGmemcpy.S
index 66063a9a66b8..605cb3f09900 100644
--- a/arch/sparc64/lib/NGmemcpy.S
+++ b/arch/sparc64/lib/NGmemcpy.S
@@ -1,6 +1,6 @@
1/* NGmemcpy.S: Niagara optimized memcpy. 1/* NGmemcpy.S: Niagara optimized memcpy.
2 * 2 *
3 * Copyright (C) 2006 David S. Miller (davem@davemloft.net) 3 * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net)
4 */ 4 */
5 5
6#ifdef __KERNEL__ 6#ifdef __KERNEL__
@@ -16,6 +16,12 @@
16 wr %g0, ASI_PNF, %asi 16 wr %g0, ASI_PNF, %asi
17#endif 17#endif
18 18
19#ifdef __sparc_v9__
20#define SAVE_AMOUNT 128
21#else
22#define SAVE_AMOUNT 64
23#endif
24
19#ifndef STORE_ASI 25#ifndef STORE_ASI
20#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P 26#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
21#endif 27#endif
@@ -50,7 +56,11 @@
50#endif 56#endif
51 57
52#ifndef STORE_INIT 58#ifndef STORE_INIT
59#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
53#define STORE_INIT(src,addr) stxa src, [addr] %asi 60#define STORE_INIT(src,addr) stxa src, [addr] %asi
61#else
62#define STORE_INIT(src,addr) stx src, [addr + 0x00]
63#endif
54#endif 64#endif
55 65
56#ifndef FUNC_NAME 66#ifndef FUNC_NAME
@@ -73,18 +83,19 @@
73 83
74 .globl FUNC_NAME 84 .globl FUNC_NAME
75 .type FUNC_NAME,#function 85 .type FUNC_NAME,#function
76FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */ 86FUNC_NAME: /* %i0=dst, %i1=src, %i2=len */
77 srlx %o2, 31, %g2 87 PREAMBLE
88 save %sp, -SAVE_AMOUNT, %sp
89 srlx %i2, 31, %g2
78 cmp %g2, 0 90 cmp %g2, 0
79 tne %xcc, 5 91 tne %xcc, 5
80 PREAMBLE 92 mov %i0, %o0
81 mov %o0, GLOBAL_SPARE 93 cmp %i2, 0
82 cmp %o2, 0
83 be,pn %XCC, 85f 94 be,pn %XCC, 85f
84 or %o0, %o1, %o3 95 or %o0, %i1, %i3
85 cmp %o2, 16 96 cmp %i2, 16
86 blu,a,pn %XCC, 80f 97 blu,a,pn %XCC, 80f
87 or %o3, %o2, %o3 98 or %i3, %i2, %i3
88 99
89 /* 2 blocks (128 bytes) is the minimum we can do the block 100 /* 2 blocks (128 bytes) is the minimum we can do the block
90 * copy with. We need to ensure that we'll iterate at least 101 * copy with. We need to ensure that we'll iterate at least
@@ -93,31 +104,31 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
93 * to (64 - 1) bytes from the length before we perform the 104 * to (64 - 1) bytes from the length before we perform the
94 * block copy loop. 105 * block copy loop.
95 */ 106 */
96 cmp %o2, (2 * 64) 107 cmp %i2, (2 * 64)
97 blu,pt %XCC, 70f 108 blu,pt %XCC, 70f
98 andcc %o3, 0x7, %g0 109 andcc %i3, 0x7, %g0
99 110
100 /* %o0: dst 111 /* %o0: dst
101 * %o1: src 112 * %i1: src
102 * %o2: len (known to be >= 128) 113 * %i2: len (known to be >= 128)
103 * 114 *
104 * The block copy loops will use %o4/%o5,%g2/%g3 as 115 * The block copy loops will use %i4/%i5,%g2/%g3 as
105 * temporaries while copying the data. 116 * temporaries while copying the data.
106 */ 117 */
107 118
108 LOAD(prefetch, %o1, #one_read) 119 LOAD(prefetch, %i1, #one_read)
109 wr %g0, STORE_ASI, %asi 120 wr %g0, STORE_ASI, %asi
110 121
111 /* Align destination on 64-byte boundary. */ 122 /* Align destination on 64-byte boundary. */
112 andcc %o0, (64 - 1), %o4 123 andcc %o0, (64 - 1), %i4
113 be,pt %XCC, 2f 124 be,pt %XCC, 2f
114 sub %o4, 64, %o4 125 sub %i4, 64, %i4
115 sub %g0, %o4, %o4 ! bytes to align dst 126 sub %g0, %i4, %i4 ! bytes to align dst
116 sub %o2, %o4, %o2 127 sub %i2, %i4, %i2
1171: subcc %o4, 1, %o4 1281: subcc %i4, 1, %i4
118 EX_LD(LOAD(ldub, %o1, %g1)) 129 EX_LD(LOAD(ldub, %i1, %g1))
119 EX_ST(STORE(stb, %g1, %o0)) 130 EX_ST(STORE(stb, %g1, %o0))
120 add %o1, 1, %o1 131 add %i1, 1, %i1
121 bne,pt %XCC, 1b 132 bne,pt %XCC, 1b
122 add %o0, 1, %o0 133 add %o0, 1, %o0
123 134
@@ -136,111 +147,155 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
136 * aligned store data at a time, this is easy to ensure. 147 * aligned store data at a time, this is easy to ensure.
137 */ 148 */
1382: 1492:
139 andcc %o1, (16 - 1), %o4 150 andcc %i1, (16 - 1), %i4
140 andn %o2, (64 - 1), %g1 ! block copy loop iterator 151 andn %i2, (64 - 1), %g1 ! block copy loop iterator
141 sub %o2, %g1, %o2 ! final sub-block copy bytes
142 be,pt %XCC, 50f 152 be,pt %XCC, 50f
143 cmp %o4, 8 153 sub %i2, %g1, %i2 ! final sub-block copy bytes
144 be,a,pt %XCC, 10f 154
145 sub %o1, 0x8, %o1 155 cmp %i4, 8
156 be,pt %XCC, 10f
157 sub %i1, %i4, %i1
146 158
147 /* Neither 8-byte nor 16-byte aligned, shift and mask. */ 159 /* Neither 8-byte nor 16-byte aligned, shift and mask. */
148 mov %g1, %o4 160 and %i4, 0x7, GLOBAL_SPARE
149 and %o1, 0x7, %g1 161 sll GLOBAL_SPARE, 3, GLOBAL_SPARE
150 sll %g1, 3, %g1 162 mov 64, %i5
151 mov 64, %o3 163 EX_LD(LOAD_TWIN(%i1, %g2, %g3))
152 andn %o1, 0x7, %o1 164 sub %i5, GLOBAL_SPARE, %i5
153 EX_LD(LOAD(ldx, %o1, %g2)) 165 mov 16, %o4
154 sub %o3, %g1, %o3 166 mov 32, %o5
155 sllx %g2, %g1, %g2 167 mov 48, %o7
168 mov 64, %i3
169
170 bg,pn %XCC, 9f
171 nop
156 172
157#define SWIVEL_ONE_DWORD(SRC, TMP1, TMP2, PRE_VAL, PRE_SHIFT, POST_SHIFT, DST)\ 173#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
158 EX_LD(LOAD(ldx, SRC, TMP1)); \ 174 sllx WORD1, POST_SHIFT, WORD1; \
159 srlx TMP1, PRE_SHIFT, TMP2; \ 175 srlx WORD2, PRE_SHIFT, TMP; \
160 or TMP2, PRE_VAL, TMP2; \ 176 sllx WORD2, POST_SHIFT, WORD2; \
161 EX_ST(STORE_INIT(TMP2, DST)); \ 177 or WORD1, TMP, WORD1; \
162 sllx TMP1, POST_SHIFT, PRE_VAL; 178 srlx WORD3, PRE_SHIFT, TMP; \
163 179 or WORD2, TMP, WORD2;
1641: add %o1, 0x8, %o1 180
165 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x00) 1818: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
166 add %o1, 0x8, %o1 182 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
167 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x08) 183 LOAD(prefetch, %i1 + %i3, #one_read)
168 add %o1, 0x8, %o1 184
169 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x10) 185 EX_ST(STORE_INIT(%g2, %o0 + 0x00))
170 add %o1, 0x8, %o1 186 EX_ST(STORE_INIT(%g3, %o0 + 0x08))
171 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x18) 187
172 add %o1, 32, %o1 188 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
173 LOAD(prefetch, %o1, #one_read) 189 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
174 sub %o1, 32 - 8, %o1 190
175 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x20) 191 EX_ST(STORE_INIT(%o2, %o0 + 0x10))
176 add %o1, 8, %o1 192 EX_ST(STORE_INIT(%o3, %o0 + 0x18))
177 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x28) 193
178 add %o1, 8, %o1 194 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
179 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x30) 195 MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
180 add %o1, 8, %o1 196
181 SWIVEL_ONE_DWORD(%o1, %g3, %o5, %g2, %o3, %g1, %o0 + 0x38) 197 EX_ST(STORE_INIT(%g2, %o0 + 0x20))
182 subcc %o4, 64, %o4 198 EX_ST(STORE_INIT(%g3, %o0 + 0x28))
183 bne,pt %XCC, 1b 199
200 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
201 add %i1, 64, %i1
202 MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)
203
204 EX_ST(STORE_INIT(%o2, %o0 + 0x30))
205 EX_ST(STORE_INIT(%o3, %o0 + 0x38))
206
207 subcc %g1, 64, %g1
208 bne,pt %XCC, 8b
184 add %o0, 64, %o0 209 add %o0, 64, %o0
185 210
186#undef SWIVEL_ONE_DWORD 211 ba,pt %XCC, 60f
212 add %i1, %i4, %i1
213
2149: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3))
215 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
216 LOAD(prefetch, %i1 + %i3, #one_read)
217
218 EX_ST(STORE_INIT(%g3, %o0 + 0x00))
219 EX_ST(STORE_INIT(%o2, %o0 + 0x08))
220
221 EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3))
222 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
223
224 EX_ST(STORE_INIT(%o3, %o0 + 0x10))
225 EX_ST(STORE_INIT(%g2, %o0 + 0x18))
226
227 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
228 MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
229
230 EX_ST(STORE_INIT(%g3, %o0 + 0x20))
231 EX_ST(STORE_INIT(%o2, %o0 + 0x28))
232
233 EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3))
234 add %i1, 64, %i1
235 MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)
236
237 EX_ST(STORE_INIT(%o3, %o0 + 0x30))
238 EX_ST(STORE_INIT(%g2, %o0 + 0x38))
239
240 subcc %g1, 64, %g1
241 bne,pt %XCC, 9b
242 add %o0, 64, %o0
187 243
188 srl %g1, 3, %g1
189 ba,pt %XCC, 60f 244 ba,pt %XCC, 60f
190 add %o1, %g1, %o1 245 add %i1, %i4, %i1
191 246
19210: /* Destination is 64-byte aligned, source was only 8-byte 24710: /* Destination is 64-byte aligned, source was only 8-byte
193 * aligned but it has been subtracted by 8 and we perform 248 * aligned but it has been subtracted by 8 and we perform
194 * one twin load ahead, then add 8 back into source when 249 * one twin load ahead, then add 8 back into source when
195 * we finish the loop. 250 * we finish the loop.
196 */ 251 */
197 EX_LD(LOAD_TWIN(%o1, %o4, %o5)) 252 EX_LD(LOAD_TWIN(%i1, %o4, %o5))
1981: add %o1, 16, %o1 253 mov 16, %o7
199 EX_LD(LOAD_TWIN(%o1, %g2, %g3)) 254 mov 32, %g2
200 add %o1, 16 + 32, %o1 255 mov 48, %g3
201 LOAD(prefetch, %o1, #one_read) 256 mov 64, %o1
202 sub %o1, 32, %o1 2571: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
258 LOAD(prefetch, %i1 + %o1, #one_read)
203 EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! initializes cache line 259 EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! initializes cache line
204 EX_ST(STORE_INIT(%g2, %o0 + 0x08)) 260 EX_ST(STORE_INIT(%o2, %o0 + 0x08))
205 EX_LD(LOAD_TWIN(%o1, %o4, %o5)) 261 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
206 add %o1, 16, %o1 262 EX_ST(STORE_INIT(%o3, %o0 + 0x10))
207 EX_ST(STORE_INIT(%g3, %o0 + 0x10))
208 EX_ST(STORE_INIT(%o4, %o0 + 0x18)) 263 EX_ST(STORE_INIT(%o4, %o0 + 0x18))
209 EX_LD(LOAD_TWIN(%o1, %g2, %g3)) 264 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
210 add %o1, 16, %o1
211 EX_ST(STORE_INIT(%o5, %o0 + 0x20)) 265 EX_ST(STORE_INIT(%o5, %o0 + 0x20))
212 EX_ST(STORE_INIT(%g2, %o0 + 0x28)) 266 EX_ST(STORE_INIT(%o2, %o0 + 0x28))
213 EX_LD(LOAD_TWIN(%o1, %o4, %o5)) 267 EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5))
214 EX_ST(STORE_INIT(%g3, %o0 + 0x30)) 268 add %i1, 64, %i1
269 EX_ST(STORE_INIT(%o3, %o0 + 0x30))
215 EX_ST(STORE_INIT(%o4, %o0 + 0x38)) 270 EX_ST(STORE_INIT(%o4, %o0 + 0x38))
216 subcc %g1, 64, %g1 271 subcc %g1, 64, %g1
217 bne,pt %XCC, 1b 272 bne,pt %XCC, 1b
218 add %o0, 64, %o0 273 add %o0, 64, %o0
219 274
220 ba,pt %XCC, 60f 275 ba,pt %XCC, 60f
221 add %o1, 0x8, %o1 276 add %i1, 0x8, %i1
222 277
22350: /* Destination is 64-byte aligned, and source is 16-byte 27850: /* Destination is 64-byte aligned, and source is 16-byte
224 * aligned. 279 * aligned.
225 */ 280 */
2261: EX_LD(LOAD_TWIN(%o1, %o4, %o5)) 281 mov 16, %o7
227 add %o1, 16, %o1 282 mov 32, %g2
228 EX_LD(LOAD_TWIN(%o1, %g2, %g3)) 283 mov 48, %g3
229 add %o1, 16 + 32, %o1 284 mov 64, %o1
230 LOAD(prefetch, %o1, #one_read) 2851: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5))
231 sub %o1, 32, %o1 286 EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3))
287 LOAD(prefetch, %i1 + %o1, #one_read)
232 EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line 288 EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line
233 EX_ST(STORE_INIT(%o5, %o0 + 0x08)) 289 EX_ST(STORE_INIT(%o5, %o0 + 0x08))
234 EX_LD(LOAD_TWIN(%o1, %o4, %o5)) 290 EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5))
235 add %o1, 16, %o1 291 EX_ST(STORE_INIT(%o2, %o0 + 0x10))
236 EX_ST(STORE_INIT(%g2, %o0 + 0x10)) 292 EX_ST(STORE_INIT(%o3, %o0 + 0x18))
237 EX_ST(STORE_INIT(%g3, %o0 + 0x18)) 293 EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3))
238 EX_LD(LOAD_TWIN(%o1, %g2, %g3)) 294 add %i1, 64, %i1
239 add %o1, 16, %o1
240 EX_ST(STORE_INIT(%o4, %o0 + 0x20)) 295 EX_ST(STORE_INIT(%o4, %o0 + 0x20))
241 EX_ST(STORE_INIT(%o5, %o0 + 0x28)) 296 EX_ST(STORE_INIT(%o5, %o0 + 0x28))
242 EX_ST(STORE_INIT(%g2, %o0 + 0x30)) 297 EX_ST(STORE_INIT(%o2, %o0 + 0x30))
243 EX_ST(STORE_INIT(%g3, %o0 + 0x38)) 298 EX_ST(STORE_INIT(%o3, %o0 + 0x38))
244 subcc %g1, 64, %g1 299 subcc %g1, 64, %g1
245 bne,pt %XCC, 1b 300 bne,pt %XCC, 1b
246 add %o0, 64, %o0 301 add %o0, 64, %o0
@@ -249,47 +304,47 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
24960: 30460:
250 membar #Sync 305 membar #Sync
251 306
252 /* %o2 contains any final bytes still needed to be copied 307 /* %i2 contains any final bytes still needed to be copied
253 * over. If anything is left, we copy it one byte at a time. 308 * over. If anything is left, we copy it one byte at a time.
254 */ 309 */
255 RESTORE_ASI(%o3) 310 RESTORE_ASI(%i3)
256 brz,pt %o2, 85f 311 brz,pt %i2, 85f
257 sub %o0, %o1, %o3 312 sub %o0, %i1, %i3
258 ba,a,pt %XCC, 90f 313 ba,a,pt %XCC, 90f
259 314
260 .align 64 315 .align 64
26170: /* 16 < len <= 64 */ 31670: /* 16 < len <= 64 */
262 bne,pn %XCC, 75f 317 bne,pn %XCC, 75f
263 sub %o0, %o1, %o3 318 sub %o0, %i1, %i3
264 319
26572: 32072:
266 andn %o2, 0xf, %o4 321 andn %i2, 0xf, %i4
267 and %o2, 0xf, %o2 322 and %i2, 0xf, %i2
2681: subcc %o4, 0x10, %o4 3231: subcc %i4, 0x10, %i4
269 EX_LD(LOAD(ldx, %o1, %o5)) 324 EX_LD(LOAD(ldx, %i1, %i5))
270 add %o1, 0x08, %o1 325 add %i1, 0x08, %i1
271 EX_LD(LOAD(ldx, %o1, %g1)) 326 EX_LD(LOAD(ldx, %i1, %g1))
272 sub %o1, 0x08, %o1 327 sub %i1, 0x08, %i1
273 EX_ST(STORE(stx, %o5, %o1 + %o3)) 328 EX_ST(STORE(stx, %i5, %i1 + %i3))
274 add %o1, 0x8, %o1 329 add %i1, 0x8, %i1
275 EX_ST(STORE(stx, %g1, %o1 + %o3)) 330 EX_ST(STORE(stx, %g1, %i1 + %i3))
276 bgu,pt %XCC, 1b 331 bgu,pt %XCC, 1b
277 add %o1, 0x8, %o1 332 add %i1, 0x8, %i1
27873: andcc %o2, 0x8, %g0 33373: andcc %i2, 0x8, %g0
279 be,pt %XCC, 1f 334 be,pt %XCC, 1f
280 nop 335 nop
281 sub %o2, 0x8, %o2 336 sub %i2, 0x8, %i2
282 EX_LD(LOAD(ldx, %o1, %o5)) 337 EX_LD(LOAD(ldx, %i1, %i5))
283 EX_ST(STORE(stx, %o5, %o1 + %o3)) 338 EX_ST(STORE(stx, %i5, %i1 + %i3))
284 add %o1, 0x8, %o1 339 add %i1, 0x8, %i1
2851: andcc %o2, 0x4, %g0 3401: andcc %i2, 0x4, %g0
286 be,pt %XCC, 1f 341 be,pt %XCC, 1f
287 nop 342 nop
288 sub %o2, 0x4, %o2 343 sub %i2, 0x4, %i2
289 EX_LD(LOAD(lduw, %o1, %o5)) 344 EX_LD(LOAD(lduw, %i1, %i5))
290 EX_ST(STORE(stw, %o5, %o1 + %o3)) 345 EX_ST(STORE(stw, %i5, %i1 + %i3))
291 add %o1, 0x4, %o1 346 add %i1, 0x4, %i1
2921: cmp %o2, 0 3471: cmp %i2, 0
293 be,pt %XCC, 85f 348 be,pt %XCC, 85f
294 nop 349 nop
295 ba,pt %xcc, 90f 350 ba,pt %xcc, 90f
@@ -300,71 +355,71 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len */
300 sub %g1, 0x8, %g1 355 sub %g1, 0x8, %g1
301 be,pn %icc, 2f 356 be,pn %icc, 2f
302 sub %g0, %g1, %g1 357 sub %g0, %g1, %g1
303 sub %o2, %g1, %o2 358 sub %i2, %g1, %i2
304 359
3051: subcc %g1, 1, %g1 3601: subcc %g1, 1, %g1
306 EX_LD(LOAD(ldub, %o1, %o5)) 361 EX_LD(LOAD(ldub, %i1, %i5))
307 EX_ST(STORE(stb, %o5, %o1 + %o3)) 362 EX_ST(STORE(stb, %i5, %i1 + %i3))
308 bgu,pt %icc, 1b 363 bgu,pt %icc, 1b
309 add %o1, 1, %o1 364 add %i1, 1, %i1
310 365
3112: add %o1, %o3, %o0 3662: add %i1, %i3, %o0
312 andcc %o1, 0x7, %g1 367 andcc %i1, 0x7, %g1
313 bne,pt %icc, 8f 368 bne,pt %icc, 8f
314 sll %g1, 3, %g1 369 sll %g1, 3, %g1
315 370
316 cmp %o2, 16 371 cmp %i2, 16
317 bgeu,pt %icc, 72b 372 bgeu,pt %icc, 72b
318 nop 373 nop
319 ba,a,pt %xcc, 73b 374 ba,a,pt %xcc, 73b
320 375
3218: mov 64, %o3 3768: mov 64, %i3
322 andn %o1, 0x7, %o1 377 andn %i1, 0x7, %i1
323 EX_LD(LOAD(ldx, %o1, %g2)) 378 EX_LD(LOAD(ldx, %i1, %g2))
324 sub %o3, %g1, %o3 379 sub %i3, %g1, %i3
325 andn %o2, 0x7, %o4 380 andn %i2, 0x7, %i4
326 sllx %g2, %g1, %g2 381 sllx %g2, %g1, %g2
3271: add %o1, 0x8, %o1 3821: add %i1, 0x8, %i1
328 EX_LD(LOAD(ldx, %o1, %g3)) 383 EX_LD(LOAD(ldx, %i1, %g3))
329 subcc %o4, 0x8, %o4 384 subcc %i4, 0x8, %i4
330 srlx %g3, %o3, %o5 385 srlx %g3, %i3, %i5
331 or %o5, %g2, %o5 386 or %i5, %g2, %i5
332 EX_ST(STORE(stx, %o5, %o0)) 387 EX_ST(STORE(stx, %i5, %o0))
333 add %o0, 0x8, %o0 388 add %o0, 0x8, %o0
334 bgu,pt %icc, 1b 389 bgu,pt %icc, 1b
335 sllx %g3, %g1, %g2 390 sllx %g3, %g1, %g2
336 391
337 srl %g1, 3, %g1 392 srl %g1, 3, %g1
338 andcc %o2, 0x7, %o2 393 andcc %i2, 0x7, %i2
339 be,pn %icc, 85f 394 be,pn %icc, 85f
340 add %o1, %g1, %o1 395 add %i1, %g1, %i1
341 ba,pt %xcc, 90f 396 ba,pt %xcc, 90f
342 sub %o0, %o1, %o3 397 sub %o0, %i1, %i3
343 398
344 .align 64 399 .align 64
34580: /* 0 < len <= 16 */ 40080: /* 0 < len <= 16 */
346 andcc %o3, 0x3, %g0 401 andcc %i3, 0x3, %g0
347 bne,pn %XCC, 90f 402 bne,pn %XCC, 90f
348 sub %o0, %o1, %o3 403 sub %o0, %i1, %i3
349 404
3501: 4051:
351 subcc %o2, 4, %o2 406 subcc %i2, 4, %i2
352 EX_LD(LOAD(lduw, %o1, %g1)) 407 EX_LD(LOAD(lduw, %i1, %g1))
353 EX_ST(STORE(stw, %g1, %o1 + %o3)) 408 EX_ST(STORE(stw, %g1, %i1 + %i3))
354 bgu,pt %XCC, 1b 409 bgu,pt %XCC, 1b
355 add %o1, 4, %o1 410 add %i1, 4, %i1
356 411
35785: retl 41285: ret
358 mov EX_RETVAL(GLOBAL_SPARE), %o0 413 restore EX_RETVAL(%i0), %g0, %o0
359 414
360 .align 32 415 .align 32
36190: 41690:
362 subcc %o2, 1, %o2 417 subcc %i2, 1, %i2
363 EX_LD(LOAD(ldub, %o1, %g1)) 418 EX_LD(LOAD(ldub, %i1, %g1))
364 EX_ST(STORE(stb, %g1, %o1 + %o3)) 419 EX_ST(STORE(stb, %g1, %i1 + %i3))
365 bgu,pt %XCC, 90b 420 bgu,pt %XCC, 90b
366 add %o1, 1, %o1 421 add %i1, 1, %i1
367 retl 422 ret
368 mov EX_RETVAL(GLOBAL_SPARE), %o0 423 restore EX_RETVAL(%i0), %g0, %o0
369 424
370 .size FUNC_NAME, .-FUNC_NAME 425 .size FUNC_NAME, .-FUNC_NAME