diff options
Diffstat (limited to 'arch/sparc/lib/NGmemcpy.S')
-rw-r--r-- | arch/sparc/lib/NGmemcpy.S | 425 |
1 files changed, 425 insertions, 0 deletions
diff --git a/arch/sparc/lib/NGmemcpy.S b/arch/sparc/lib/NGmemcpy.S new file mode 100644 index 000000000000..96a14caf6966 --- /dev/null +++ b/arch/sparc/lib/NGmemcpy.S | |||
@@ -0,0 +1,425 @@ | |||
1 | /* NGmemcpy.S: Niagara optimized memcpy. | ||
2 | * | ||
3 | * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) | ||
4 | */ | ||
5 | |||
6 | #ifdef __KERNEL__ /* kernel build: spare global is %g7, and %asi is reloaded from the byte at [%g6 + TI_CURRENT_DS]; NOTE(review): %g6 presumably points at the current thread_info - confirm */ | ||
7 | #include <asm/asi.h> | ||
8 | #include <asm/thread_info.h> | ||
9 | #define GLOBAL_SPARE %g7 | ||
10 | #define RESTORE_ASI(TMP) \ | ||
11 | ldub [%g6 + TI_CURRENT_DS], TMP; \ | ||
12 | wr TMP, 0x0, %asi; | ||
13 | #else /* non-kernel build: spare global is %g5, and %asi is reset to ASI_PNF */ | ||
14 | #define GLOBAL_SPARE %g5 | ||
15 | #define RESTORE_ASI(TMP) \ | ||
16 | wr %g0, ASI_PNF, %asi | ||
17 | #endif | ||
18 | |||
19 | #ifdef __sparc_v9__ /* SAVE_AMOUNT is the byte count the entry save instruction subtracts from %sp */ | ||
20 | #define SAVE_AMOUNT 128 | ||
21 | #else | ||
22 | #define SAVE_AMOUNT 64 | ||
23 | #endif | ||
24 | |||
25 | #ifndef STORE_ASI /* ASI written to %asi before the block-copy loops; overridable by the includer */ | ||
26 | #define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P | ||
27 | #endif | ||
28 | |||
29 | #ifndef EX_LD /* wrapper around every load that copies data; the default passes the instruction through unchanged. NOTE(review): presumably overridden by includers that need fault handling - confirm */ | ||
30 | #define EX_LD(x) x | ||
31 | #endif | ||
32 | |||
33 | #ifndef EX_ST /* wrapper around every store; the default passes the instruction through unchanged */ | ||
34 | #define EX_ST(x) x | ||
35 | #endif | ||
36 | |||
37 | #ifndef EX_RETVAL /* wrapper around the value handed back by the final restore; the default is the value itself */ | ||
38 | #define EX_RETVAL(x) x | ||
39 | #endif | ||
40 | |||
41 | #ifndef LOAD /* LOAD(type,addr,dest): a plain load, or with MEMCPY_DEBUG the alternate-space form of the same opcode using ASI 0x80 */ | ||
42 | #ifndef MEMCPY_DEBUG | ||
43 | #define LOAD(type,addr,dest) type [addr], dest | ||
44 | #else | ||
45 | #define LOAD(type,addr,dest) type##a [addr] 0x80, dest | ||
46 | #endif | ||
47 | #endif | ||
48 | |||
49 | #ifndef LOAD_TWIN /* ldda through ASI_BLK_INIT_QUAD_LDD_P into dest0; dest1 is not referenced by the expansion. NOTE(review): the load presumably fills the dest0/dest1 pair with 16 bytes - confirm against the ASI documentation */ | ||
50 | #define LOAD_TWIN(addr_reg,dest0,dest1) \ | ||
51 | ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0 | ||
52 | #endif | ||
53 | |||
54 | #ifndef STORE /* STORE(type,src,addr): plain store of src to [addr] */ | ||
55 | #define STORE(type,src,addr) type src, [addr] | ||
56 | #endif | ||
57 | |||
58 | #ifndef STORE_INIT /* 8-byte store through whatever ASI %asi currently holds, or a plain stx when SIMULATE_NIAGARA_ON_NON_NIAGARA is defined */ | ||
59 | #ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA | ||
60 | #define STORE_INIT(src,addr) stxa src, [addr] %asi | ||
61 | #else | ||
62 | #define STORE_INIT(src,addr) stx src, [addr + 0x00] | ||
63 | #endif | ||
64 | #endif | ||
65 | |||
66 | #ifndef FUNC_NAME /* exported symbol name; defaults to NGmemcpy */ | ||
67 | #define FUNC_NAME NGmemcpy | ||
68 | #endif | ||
69 | |||
70 | #ifndef PREAMBLE /* optional code emitted at function entry before the save; empty by default */ | ||
71 | #define PREAMBLE | ||
72 | #endif | ||
73 | |||
74 | #ifndef XCC /* condition-code set named by the branches written with %XCC; defaults to xcc */ | ||
75 | #define XCC xcc | ||
76 | #endif | ||
77 | |||
78 | .register %g2,#scratch /* tell the assembler %g2 and %g3 are used as scratch registers */ | ||
79 | .register %g3,#scratch | ||
80 | |||
81 | .text | ||
82 | .align 64 | ||
83 | |||
84 | .globl FUNC_NAME | ||
85 | .type FUNC_NAME,#function | ||
86 | FUNC_NAME: /* %i0=dst, %i1=src, %i2=len
 *
 * Copies len bytes from src to dst and hands EX_RETVAL(%i0) back in
 * the caller's %o0.  The length selects the path: 0 returns at once,
 * len < 16 goes to 80, len < 128 goes to 70, and anything larger
 * falls through to the 64-byte block copy.  The instruction that
 * follows each branch sits in that branch's delay slot.
 */ | ||
87 | PREAMBLE | ||
88 | save %sp, -SAVE_AMOUNT, %sp | ||
89 | srlx %i2, 31, %g2 | ||
90 | cmp %g2, 0 | ||
91 | tne %xcc, 5 /* software trap 5 when len has any bit set at or above bit 31 */ | ||
92 | mov %i0, %o0 /* %o0 is the working dst pointer; %i0 is left alone for the return value */ | ||
93 | cmp %i2, 0 | ||
94 | be,pn %XCC, 85f /* len == 0: nothing to copy */ | ||
95 | or %o0, %i1, %i3 /* delay slot: %i3 = dst OR src, used for the alignment tests */ | ||
96 | cmp %i2, 16 | ||
97 | blu,a,pn %XCC, 80f | ||
98 | or %i3, %i2, %i3 /* annulled delay slot, runs only when the branch is taken: %i3 = dst OR src OR len */ | ||
99 | |||
100 | /* 2 blocks (128 bytes) is the minimum we can do the block | ||
101 | * copy with. We need to ensure that we'll iterate at least | ||
102 | * once in the block copy loop. At worst we'll need to align | ||
103 | * the destination to a 64-byte boundary which can chew up | ||
104 | * to (64 - 1) bytes from the length before we perform the | ||
105 | * block copy loop. | ||
106 | */ | ||
107 | cmp %i2, (2 * 64) | ||
108 | blu,pt %XCC, 70f | ||
109 | andcc %i3, 0x7, %g0 /* delay slot: flags = (dst OR src) AND 7, consumed by the first branch at 70 */ | ||
110 | |||
111 | /* %o0: dst | ||
112 | * %i1: src | ||
113 | * %i2: len (known to be >= 128) | ||
114 | * | ||
115 | * The block copy loops will use %i4/%i5,%g2/%g3 as | ||
116 | * temporaries while copying the data. | ||
117 | */ | ||
118 | |||
119 | LOAD(prefetch, %i1, #one_read) | ||
120 | wr %g0, STORE_ASI, %asi /* the STORE_INIT stores in the block loops go through %asi */ | ||
121 | |||
122 | /* Align destination on 64-byte boundary. */ | ||
123 | andcc %o0, (64 - 1), %i4 | ||
124 | be,pt %XCC, 2f /* dst already 64-byte aligned */ | ||
125 | sub %i4, 64, %i4 /* delay slot: %i4 = (dst AND 63) - 64; negated on the next line */ | ||
126 | sub %g0, %i4, %i4 ! bytes to align dst | ||
127 | sub %i2, %i4, %i2 /* len -= bytes consumed by the alignment loop */ | ||
128 | 1: subcc %i4, 1, %i4 /* byte-at-a-time copy until dst reaches the boundary */ | ||
129 | EX_LD(LOAD(ldub, %i1, %g1)) | ||
130 | EX_ST(STORE(stb, %g1, %o0)) | ||
131 | add %i1, 1, %i1 | ||
132 | bne,pt %XCC, 1b | ||
133 | add %o0, 1, %o0 | ||
134 | |||
135 | /* If the source is on a 16-byte boundary we can do | ||
136 | * the direct block copy loop. If it is 8-byte aligned | ||
137 | * we can do the 16-byte loads offset by -8 bytes and the | ||
138 | * init stores offset by one register. | ||
139 | * | ||
140 | * If the source is not even 8-byte aligned, we need to do | ||
141 | * shifting and masking (basically integer faligndata). | ||
142 | * | ||
143 | * The careful bit with init stores is that if we store | ||
144 | * to any part of the cache line we have to store the whole | ||
145 | * cacheline else we can end up with corrupt L2 cache line | ||
146 | * contents. Since the loop works on 64-bytes of 64-byte | ||
147 | * aligned store data at a time, this is easy to ensure. | ||
148 | */ | ||
149 | 2: | ||
150 | andcc %i1, (16 - 1), %i4 /* %i4 = src AND 15 */ | ||
151 | andn %i2, (64 - 1), %g1 ! block copy loop iterator | ||
152 | be,pt %XCC, 50f | ||
153 | sub %i2, %g1, %i2 ! final sub-block copy bytes | ||
154 | |||
155 | cmp %i4, 8 | ||
156 | be,pt %XCC, 10f /* src is exactly 8 bytes past a 16-byte boundary */ | ||
157 | sub %i1, %i4, %i1 /* delay slot: round src down to the 16-byte boundary on both paths */ | ||
158 | |||
159 | /* Neither 8-byte nor 16-byte aligned, shift and mask. */ | ||
160 | and %i4, 0x7, GLOBAL_SPARE | ||
161 | sll GLOBAL_SPARE, 3, GLOBAL_SPARE /* GLOBAL_SPARE = (src AND 7) * 8: left-shift count in bits */ | ||
162 | mov 64, %i5 | ||
163 | EX_LD(LOAD_TWIN(%i1, %g2, %g3)) /* prime %g2/%g3 with the first twin load */ | ||
164 | sub %i5, GLOBAL_SPARE, %i5 /* %i5 = 64 - GLOBAL_SPARE: right-shift count in bits */ | ||
165 | mov 16, %o4 /* %o4/%o5/%o7/%i3 hold the offsets 16/32/48/64 used by the twin loads */ | ||
166 | mov 32, %o5 | ||
167 | mov 48, %o7 | ||
168 | mov 64, %i3 | ||
169 | |||
170 | bg,pn %XCC, 9f /* flags are still those of the cmp %i4, 8 above (nothing in between sets them): src AND 15 > 8 selects loop 9, otherwise loop 8 */ | ||
171 | nop | ||
172 | |||
/* MIX_THREE_WORDS: shift a three-word window to produce two output words.
 *   WORD1 = (WORD1 << POST_SHIFT) OR (WORD2 >> PRE_SHIFT)
 *   WORD2 = (WORD2 << POST_SHIFT) OR (WORD3 >> PRE_SHIFT)
 * WORD3 is only read; TMP is clobbered.
 */
173 | #define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \ | ||
174 | sllx WORD1, POST_SHIFT, WORD1; \ | ||
175 | srlx WORD2, PRE_SHIFT, TMP; \ | ||
176 | sllx WORD2, POST_SHIFT, WORD2; \ | ||
177 | or WORD1, TMP, WORD1; \ | ||
178 | srlx WORD3, PRE_SHIFT, TMP; \ | ||
179 | or WORD2, TMP, WORD2; | ||
180 | |||
181 | 8: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3)) /* loop 8, src AND 15 in 1..7: output starts inside the first word (%g2) of the pending twin load; each pass writes one full 64-byte destination block */ | ||
182 | MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) | ||
183 | LOAD(prefetch, %i1 + %i3, #one_read) | ||
184 | |||
185 | EX_ST(STORE_INIT(%g2, %o0 + 0x00)) | ||
186 | EX_ST(STORE_INIT(%g3, %o0 + 0x08)) | ||
187 | |||
188 | EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3)) | ||
189 | MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) | ||
190 | |||
191 | EX_ST(STORE_INIT(%o2, %o0 + 0x10)) | ||
192 | EX_ST(STORE_INIT(%o3, %o0 + 0x18)) | ||
193 | |||
194 | EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) | ||
195 | MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1) | ||
196 | |||
197 | EX_ST(STORE_INIT(%g2, %o0 + 0x20)) | ||
198 | EX_ST(STORE_INIT(%g3, %o0 + 0x28)) | ||
199 | |||
200 | EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3)) /* this twin load stays in %g2/%g3 and feeds the next pass */ | ||
201 | add %i1, 64, %i1 | ||
202 | MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1) | ||
203 | |||
204 | EX_ST(STORE_INIT(%o2, %o0 + 0x30)) | ||
205 | EX_ST(STORE_INIT(%o3, %o0 + 0x38)) | ||
206 | |||
207 | subcc %g1, 64, %g1 /* %g1 = block bytes still to copy */ | ||
208 | bne,pt %XCC, 8b | ||
209 | add %o0, 64, %o0 | ||
210 | |||
211 | ba,pt %XCC, 60f | ||
212 | add %i1, %i4, %i1 /* delay slot: put the src AND 15 offset back onto the rounded-down src */ | ||
213 | |||
214 | 9: EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3)) /* loop 9, src AND 15 in 9..15: same scheme as loop 8 but the window starts at the second word (%g3), so the stored pairs straddle two twin loads */ | ||
215 | MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) | ||
216 | LOAD(prefetch, %i1 + %i3, #one_read) | ||
217 | |||
218 | EX_ST(STORE_INIT(%g3, %o0 + 0x00)) | ||
219 | EX_ST(STORE_INIT(%o2, %o0 + 0x08)) | ||
220 | |||
221 | EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3)) | ||
222 | MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) | ||
223 | |||
224 | EX_ST(STORE_INIT(%o3, %o0 + 0x10)) | ||
225 | EX_ST(STORE_INIT(%g2, %o0 + 0x18)) | ||
226 | |||
227 | EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) | ||
228 | MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1) | ||
229 | |||
230 | EX_ST(STORE_INIT(%g3, %o0 + 0x20)) | ||
231 | EX_ST(STORE_INIT(%o2, %o0 + 0x28)) | ||
232 | |||
233 | EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3)) | ||
234 | add %i1, 64, %i1 | ||
235 | MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1) | ||
236 | |||
237 | EX_ST(STORE_INIT(%o3, %o0 + 0x30)) | ||
238 | EX_ST(STORE_INIT(%g2, %o0 + 0x38)) | ||
239 | |||
240 | subcc %g1, 64, %g1 | ||
241 | bne,pt %XCC, 9b | ||
242 | add %o0, 64, %o0 | ||
243 | |||
244 | ba,pt %XCC, 60f | ||
245 | add %i1, %i4, %i1 /* delay slot: put the src AND 15 offset back onto the rounded-down src */ | ||
246 | |||
247 | 10: /* Destination is 64-byte aligned and the source sits exactly | ||
248 | * 8 bytes past a 16-byte boundary. The source pointer has already | ||
249 | * been moved back by 8, so we keep one twin load in flight and | ||
250 | * store its second word first; 8 is added back when the loop ends. | ||
251 | */ | ||
252 | EX_LD(LOAD_TWIN(%i1, %o4, %o5)) | ||
253 | mov 16, %o7 /* %o7/%g2/%g3/%o1 hold the offsets 16/32/48/64 used by the twin loads */ | ||
254 | mov 32, %g2 | ||
255 | mov 48, %g3 | ||
256 | mov 64, %o1 | ||
257 | 1: EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) | ||
258 | LOAD(prefetch, %i1 + %o1, #one_read) | ||
259 | EX_ST(STORE_INIT(%o5, %o0 + 0x00)) ! initializes cache line | ||
260 | EX_ST(STORE_INIT(%o2, %o0 + 0x08)) | ||
261 | EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5)) | ||
262 | EX_ST(STORE_INIT(%o3, %o0 + 0x10)) | ||
263 | EX_ST(STORE_INIT(%o4, %o0 + 0x18)) | ||
264 | EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3)) | ||
265 | EX_ST(STORE_INIT(%o5, %o0 + 0x20)) | ||
266 | EX_ST(STORE_INIT(%o2, %o0 + 0x28)) | ||
267 | EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5)) /* this twin load stays in %o4/%o5 and feeds the next pass */ | ||
268 | add %i1, 64, %i1 | ||
269 | EX_ST(STORE_INIT(%o3, %o0 + 0x30)) | ||
270 | EX_ST(STORE_INIT(%o4, %o0 + 0x38)) | ||
271 | subcc %g1, 64, %g1 /* %g1 = block bytes still to copy */ | ||
272 | bne,pt %XCC, 1b | ||
273 | add %o0, 64, %o0 | ||
274 | |||
275 | ba,pt %XCC, 60f | ||
276 | add %i1, 0x8, %i1 /* delay slot: undo the earlier 8-byte step back of src */ | ||
277 | |||
278 | 50: /* Destination is 64-byte aligned, and source is 16-byte | ||
279 | * aligned: four twin loads feed eight stores with no shifting. | ||
280 | */ | ||
281 | mov 16, %o7 | ||
282 | mov 32, %g2 | ||
283 | mov 48, %g3 | ||
284 | mov 64, %o1 | ||
285 | 1: EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5)) | ||
286 | EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3)) | ||
287 | LOAD(prefetch, %i1 + %o1, #one_read) | ||
288 | EX_ST(STORE_INIT(%o4, %o0 + 0x00)) ! initializes cache line | ||
289 | EX_ST(STORE_INIT(%o5, %o0 + 0x08)) | ||
290 | EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5)) | ||
291 | EX_ST(STORE_INIT(%o2, %o0 + 0x10)) | ||
292 | EX_ST(STORE_INIT(%o3, %o0 + 0x18)) | ||
293 | EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3)) | ||
294 | add %i1, 64, %i1 | ||
295 | EX_ST(STORE_INIT(%o4, %o0 + 0x20)) | ||
296 | EX_ST(STORE_INIT(%o5, %o0 + 0x28)) | ||
297 | EX_ST(STORE_INIT(%o2, %o0 + 0x30)) | ||
298 | EX_ST(STORE_INIT(%o3, %o0 + 0x38)) | ||
299 | subcc %g1, 64, %g1 | ||
300 | bne,pt %XCC, 1b | ||
301 | add %o0, 64, %o0 | ||
302 | /* fall through to 60 */ | ||
303 | |||
304 | 60: /* common exit of all four block-copy loops */ | ||
305 | membar #Sync | ||
306 | |||
307 | /* %i2 holds the count of final bytes (fewer than 64) that the | ||
308 | * block loop left over. If anything is left, we copy it one byte at a time. | ||
309 | */ | ||
310 | RESTORE_ASI(%i3) /* %i3 is only a scratch register here; it is overwritten two lines below */ | ||
311 | brz,pt %i2, 85f /* nothing left: return */ | ||
312 | sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src, the index the byte loop at 90 stores through */ | ||
313 | ba,a,pt %XCC, 90f /* annul bit set on an unconditional branch: no delay-slot instruction is executed */ | ||
314 | |||
315 | .align 64 | ||
316 | 70: /* 16 <= len < 128 */ | ||
317 | bne,pn %XCC, 75f /* flags come from the delay slot of the branch that got here: dst or src is not 8-byte aligned */ | ||
318 | sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src, so [%i1 + %i3] addresses dst */ | ||
319 | |||
320 | 72: /* dst and src both 8-byte aligned, at least 16 bytes left */ | ||
321 | andn %i2, 0xf, %i4 /* %i4 = bytes handled by the 16-byte loop */ | ||
322 | and %i2, 0xf, %i2 /* %i2 = bytes left for 73 and the byte loop */ | ||
323 | 1: subcc %i4, 0x10, %i4 /* sets the flags tested by the bgu at the bottom; nothing in between changes them */ | ||
324 | EX_LD(LOAD(ldx, %i1, %o4)) | ||
325 | add %i1, 0x08, %i1 /* step forward for the second load, then back again so the first store lands at the right dst offset */ | ||
326 | EX_LD(LOAD(ldx, %i1, %g1)) | ||
327 | sub %i1, 0x08, %i1 | ||
328 | EX_ST(STORE(stx, %o4, %i1 + %i3)) | ||
329 | add %i1, 0x8, %i1 | ||
330 | EX_ST(STORE(stx, %g1, %i1 + %i3)) | ||
331 | bgu,pt %XCC, 1b | ||
332 | add %i1, 0x8, %i1 | ||
333 | 73: andcc %i2, 0x8, %g0 /* copy one 8-byte word if bit 3 of the remaining length is set */ | ||
334 | be,pt %XCC, 1f | ||
335 | nop | ||
336 | sub %i2, 0x8, %i2 | ||
337 | EX_LD(LOAD(ldx, %i1, %o4)) | ||
338 | EX_ST(STORE(stx, %o4, %i1 + %i3)) | ||
339 | add %i1, 0x8, %i1 | ||
340 | 1: andcc %i2, 0x4, %g0 /* copy one 4-byte word if bit 2 of the remaining length is set */ | ||
341 | be,pt %XCC, 1f | ||
342 | nop | ||
343 | sub %i2, 0x4, %i2 | ||
344 | EX_LD(LOAD(lduw, %i1, %i5)) | ||
345 | EX_ST(STORE(stw, %i5, %i1 + %i3)) | ||
346 | add %i1, 0x4, %i1 | ||
347 | 1: cmp %i2, 0 | ||
348 | be,pt %XCC, 85f /* nothing left: return */ | ||
349 | nop | ||
350 | ba,pt %xcc, 90f /* otherwise finish in the byte loop */ | ||
351 | nop | ||
352 | |||
353 | 75: /* 16 <= len < 128 and dst or src is not 8-byte aligned; %i3 = dst - src on entry */ | ||
354 | andcc %o0, 0x7, %g1 | ||
355 | sub %g1, 0x8, %g1 /* plain sub: the flags from the andcc survive for the branch below */ | ||
356 | be,pn %icc, 2f /* dst is already 8-byte aligned */ | ||
357 | sub %g0, %g1, %g1 /* delay slot: %g1 = 8 - (dst AND 7), the bytes needed to align dst */ | ||
358 | sub %i2, %g1, %i2 /* len -= alignment bytes (at most 7, so at least 9 bytes remain) */ | ||
359 | |||
360 | 1: subcc %g1, 1, %g1 /* byte-at-a-time copy until dst is 8-byte aligned */ | ||
361 | EX_LD(LOAD(ldub, %i1, %i5)) | ||
362 | EX_ST(STORE(stb, %i5, %i1 + %i3)) | ||
363 | bgu,pt %icc, 1b | ||
364 | add %i1, 1, %i1 | ||
365 | |||
366 | 2: add %i1, %i3, %o0 /* %o0 = current dst, now 8-byte aligned */ | ||
367 | andcc %i1, 0x7, %g1 | ||
368 | bne,pt %icc, 8f /* src is still misaligned: use the shift-and-merge loop */ | ||
369 | sll %g1, 3, %g1 /* delay slot: %g1 = (src AND 7) * 8, the left-shift count in bits */ | ||
370 | |||
371 | cmp %i2, 16 /* src is aligned too: reuse the aligned paths, %i3 is still dst - src */ | ||
372 | bgeu,pt %icc, 72b | ||
373 | nop | ||
374 | ba,a,pt %xcc, 73b /* annul bit set on an unconditional branch: no delay-slot instruction is executed */ | ||
375 | |||
376 | 8: mov 64, %i3 | ||
377 | andn %i1, 0x7, %i1 /* round src down to an 8-byte boundary */ | ||
378 | EX_LD(LOAD(ldx, %i1, %g2)) | ||
379 | sub %i3, %g1, %i3 /* %i3 = 64 - %g1, the right-shift count in bits */ | ||
380 | andn %i2, 0x7, %i4 /* %i4 = bytes to copy as whole 8-byte words; nonzero because at least 9 bytes remain */ | ||
381 | sllx %g2, %g1, %g2 /* %g2 = high part of the first output word */ | ||
382 | 1: add %i1, 0x8, %i1 | ||
383 | EX_LD(LOAD(ldx, %i1, %g3)) | ||
384 | subcc %i4, 0x8, %i4 /* sets the flags tested by the bgu below; nothing in between changes them */ | ||
385 | srlx %g3, %i3, %i5 | ||
386 | or %i5, %g2, %i5 /* output word = tail of the previous word OR head of the new one */ | ||
387 | EX_ST(STORE(stx, %i5, %o0)) | ||
388 | add %o0, 0x8, %o0 | ||
389 | bgu,pt %icc, 1b | ||
390 | sllx %g3, %g1, %g2 /* delay slot: carry the tail of this word into the next pass */ | ||
391 | |||
392 | srl %g1, 3, %g1 /* back from a bit count to the byte offset src AND 7 */ | ||
393 | andcc %i2, 0x7, %i2 /* %i2 = bytes left after the word loop */ | ||
394 | be,pn %icc, 85f /* nothing left: return */ | ||
395 | add %i1, %g1, %i1 /* delay slot: restore src to its true, unrounded position */ | ||
396 | ba,pt %xcc, 90f | ||
397 | sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src again for the byte loop at 90 */ | ||
398 | |||
399 | .align 64 | ||
400 | 80: /* 0 < len < 16; %i3 = dst OR src OR len on entry */ | ||
401 | andcc %i3, 0x3, %g0 | ||
402 | bne,pn %XCC, 90f /* dst, src or len is not a multiple of 4: use the byte loop */ | ||
403 | sub %o0, %i1, %i3 /* delay slot: %i3 = dst - src, so [%i1 + %i3] addresses dst */ | ||
404 | |||
405 | 1: /* 4 bytes per pass; len is a multiple of 4 here, so the count reaches exactly 0 */ | ||
406 | subcc %i2, 4, %i2 | ||
407 | EX_LD(LOAD(lduw, %i1, %g1)) | ||
408 | EX_ST(STORE(stw, %g1, %i1 + %i3)) | ||
409 | bgu,pt %XCC, 1b | ||
410 | add %i1, 4, %i1 | ||
411 | |||
412 | 85: ret /* common return point */ | ||
413 | restore EX_RETVAL(%i0), %g0, %o0 /* delay slot: caller's %o0 = EX_RETVAL(original dst) */ | ||
414 | |||
415 | .align 32 | ||
416 | 90: /* byte loop: expects %i2 = bytes left (nonzero) and %i3 = dst - src */ | ||
417 | subcc %i2, 1, %i2 | ||
418 | EX_LD(LOAD(ldub, %i1, %g1)) | ||
419 | EX_ST(STORE(stb, %g1, %i1 + %i3)) | ||
420 | bgu,pt %XCC, 90b | ||
421 | add %i1, 1, %i1 | ||
422 | ret | ||
423 | restore EX_RETVAL(%i0), %g0, %o0 /* delay slot: caller's %o0 = EX_RETVAL(original dst) */ | ||
424 | |||
425 | .size FUNC_NAME, .-FUNC_NAME | ||