diff options
Diffstat (limited to 'arch/sparc/lib/U1memcpy.S')
-rw-r--r-- | arch/sparc/lib/U1memcpy.S | 563 |
1 files changed, 563 insertions, 0 deletions
diff --git a/arch/sparc/lib/U1memcpy.S b/arch/sparc/lib/U1memcpy.S new file mode 100644 index 000000000000..bafd2fc07acb --- /dev/null +++ b/arch/sparc/lib/U1memcpy.S | |||
@@ -0,0 +1,563 @@ | |||
1 | /* U1memcpy.S: UltraSPARC-I/II/IIi/IIe optimized memcpy. | ||
2 | * | ||
3 | * Copyright (C) 1997, 2004 David S. Miller (davem@redhat.com) | ||
4 | * Copyright (C) 1996, 1997, 1998, 1999 Jakub Jelinek (jj@ultra.linux.cz) | ||
5 | */ | ||
6 | |||
7 | #ifdef __KERNEL__ | ||
8 | #include <asm/visasm.h> /* NOTE(review): presumably supplies VISEntry and VISExit for kernel builds - confirm */ | ||
9 | #include <asm/asi.h> /* NOTE(review): presumably supplies ASI_BLK_P for kernel builds - confirm */ | ||
10 | #define GLOBAL_SPARE g7 /* scratch global register used by the copy code in kernel builds */ | ||
11 | #else | ||
12 | #define GLOBAL_SPARE g5 /* scratch global register used by the copy code outside the kernel */ | ||
13 | #define ASI_BLK_P 0xf0 /* ASI named by the default LOAD_BLK and STORE_BLK macros */ | ||
14 | #define FPRS_FEF 0x04 /* value written to %fprs by VISEntry and mask applied by VISExit */ | ||
15 | #ifdef MEMCPY_DEBUG | ||
16 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; /* save the old %fprs in %o5, then write FPRS_FEF */ \ | ||
17 | clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0; /* debug build only: zero g1-g3 and reset the condition codes */ | ||
18 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs /* write back only the FPRS_FEF bit of the saved %fprs */ | ||
19 | #else | ||
20 | #define VISEntry rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs /* save the old %fprs in %o5, then write FPRS_FEF */ | ||
21 | #define VISExit and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs /* write back only the FPRS_FEF bit of the saved %fprs */ | ||
22 | #endif | ||
23 | #endif | ||
24 | |||
25 | #ifndef EX_LD | ||
26 | #define EX_LD(x) x /* default: a load is emitted unwrapped unless EX_LD was defined beforehand */ | ||
27 | #endif | ||
28 | |||
29 | #ifndef EX_ST | ||
30 | #define EX_ST(x) x /* default: a store is emitted unwrapped unless EX_ST was defined beforehand */ | ||
31 | #endif | ||
32 | |||
33 | #ifndef EX_RETVAL | ||
34 | #define EX_RETVAL(x) x /* default: the return value operand is used as given */ | ||
35 | #endif | ||
36 | |||
37 | #ifndef LOAD | ||
38 | #define LOAD(type,addr,dest) type [addr], dest /* default: plain load with the given mnemonic */ | ||
39 | #endif | ||
40 | |||
41 | #ifndef LOAD_BLK | ||
42 | #define LOAD_BLK(addr,dest) ldda [addr] ASI_BLK_P, dest /* default: ldda through ASI_BLK_P */ | ||
43 | #endif | ||
44 | |||
45 | #ifndef STORE | ||
46 | #define STORE(type,src,addr) type src, [addr] /* default: plain store with the given mnemonic */ | ||
47 | #endif | ||
48 | |||
49 | #ifndef STORE_BLK | ||
50 | #define STORE_BLK(src,addr) stda src, [addr] ASI_BLK_P /* default: stda through ASI_BLK_P */ | ||
51 | #endif | ||
52 | |||
53 | #ifndef FUNC_NAME | ||
54 | #define FUNC_NAME memcpy /* default symbol name of the routine defined in this file */ | ||
55 | #endif | ||
56 | |||
57 | #ifndef PREAMBLE | ||
58 | #define PREAMBLE /* default: expands to nothing at the point where the routine invokes it */ | ||
59 | #endif | ||
60 | |||
61 | #ifndef XCC | ||
62 | #define XCC xcc /* default condition-code register named by the branches written with %XCC */ | ||
63 | #endif | ||
64 | |||
65 | #define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9) /* build eight doublewords in %f48-%f62 from nine consecutive input registers */ \ | ||
66 | faligndata %f1, %f2, %f48; /* each step combines two neighbouring inputs into one output doubleword */ \ | ||
67 | faligndata %f2, %f3, %f50; \ | ||
68 | faligndata %f3, %f4, %f52; \ | ||
69 | faligndata %f4, %f5, %f54; \ | ||
70 | faligndata %f5, %f6, %f56; \ | ||
71 | faligndata %f6, %f7, %f58; \ | ||
72 | faligndata %f7, %f8, %f60; \ | ||
73 | faligndata %f8, %f9, %f62; | ||
74 | |||
75 | #define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt) /* one step of the unrolled loop: load next block, store prepared block, count down */ \ | ||
76 | EX_LD(LOAD_BLK(%src, %fdest)); /* block load from src into the registers starting at fdest */ \ | ||
77 | EX_ST(STORE_BLK(%fsrc, %dest)); /* block store of the registers starting at fsrc to dest */ \ | ||
78 | add %src, 0x40, %src; \ | ||
79 | subcc %len, 0x40, %len; \ | ||
80 | be,pn %xcc, jmptgt; /* leave the loop through jmptgt once len reaches zero */ \ | ||
81 | add %dest, 0x40, %dest; /* delay slot, not annulled: dest advances on both paths */ \ | ||
82 | |||
83 | #define LOOP_CHUNK1(src, dest, len, branch_dest) /* loop step that loads into %f0 and stores from %f48 */ \ | ||
84 | MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest) | ||
85 | #define LOOP_CHUNK2(src, dest, len, branch_dest) /* loop step that loads into %f16 and stores from %f48 */ \ | ||
86 | MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest) | ||
87 | #define LOOP_CHUNK3(src, dest, len, branch_dest) /* loop step that loads into %f32 and stores from %f48 */ \ | ||
88 | MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest) | ||
89 | |||
90 | #define DO_SYNC membar #Sync; /* barrier issued after the block store in STORE_SYNC */ | ||
91 | #define STORE_SYNC(dest, fsrc) /* block store from fsrc, advance dest by 0x40, then DO_SYNC */ \ | ||
92 | EX_ST(STORE_BLK(%fsrc, %dest)); \ | ||
93 | add %dest, 0x40, %dest; \ | ||
94 | DO_SYNC | ||
95 | |||
96 | #define STORE_JUMP(dest, fsrc, target) /* block store from fsrc, advance dest by 0x40, then branch to target */ \ | ||
97 | EX_ST(STORE_BLK(%fsrc, %dest)); \ | ||
98 | add %dest, 0x40, %dest; \ | ||
99 | ba,pt %xcc, target; \ | ||
100 | nop; /* fills the branch delay slot */ | ||
101 | |||
102 | #define FINISH_VISCHUNK(dest, f0, f1, left) /* store one more aligned doubleword if at least 8 bytes remain in left, else go to 95 */ \ | ||
103 | subcc %left, 8, %left;\ | ||
104 | bl,pn %xcc, 95f; /* left went negative: nothing more to store here */ \ | ||
105 | faligndata %f0, %f1, %f48; /* delay slot, executed on both paths */ \ | ||
106 | EX_ST(STORE(std, %f48, %dest)); \ | ||
107 | add %dest, 8, %dest; | ||
108 | |||
109 | #define UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) /* same countdown and exit test, but only copies register f0 into f1; dest is unused */ \ | ||
110 | subcc %left, 8, %left; \ | ||
111 | bl,pn %xcc, 95f; \ | ||
112 | fsrc1 %f0, %f1; /* delay slot, executed on both paths */ | ||
113 | |||
114 | #define UNEVEN_VISCHUNK(dest, f0, f1, left) /* UNEVEN_VISCHUNK_LAST followed by an explicit branch to 93 */ \ | ||
115 | UNEVEN_VISCHUNK_LAST(dest, f0, f1, left) \ | ||
116 | ba,a,pt %xcc, 93f; /* annulled unconditional branch: no delay-slot instruction runs */ | ||
117 | |||
118 | .register %g2,#scratch | ||
119 | .register %g3,#scratch | ||
120 | |||
121 | .text | ||
122 | .align 64 | ||
123 | |||
124 | .globl FUNC_NAME | ||
125 | .type FUNC_NAME,#function | ||
126 | FUNC_NAME: /* %o0=dst, %o1=src, %o2=len; returns the original dst, kept in %o4, in %o0 */ | ||
127 | srlx %o2, 31, %g2 /* g2 is nonzero when len has any bit set at or above bit 31 */ | ||
128 | cmp %g2, 0 | ||
129 | tne %xcc, 5 /* such a length raises software trap 5 */ | ||
130 | PREAMBLE | ||
131 | mov %o0, %o4 /* keep the original dst for the return value */ | ||
132 | cmp %o2, 0 | ||
133 | be,pn %XCC, 85f /* len == 0: return immediately */ | ||
134 | or %o0, %o1, %o3 /* delay slot: o3 = dst OR src, used for the alignment tests */ | ||
135 | cmp %o2, 16 | ||
136 | blu,a,pn %XCC, 80f /* len < 16: small-copy path */ | ||
137 | or %o3, %o2, %o3 /* annulled delay slot, runs only when the branch is taken: fold len into o3 */ | ||
138 | |||
139 | cmp %o2, (5 * 64) | ||
140 | blu,pt %XCC, 70f /* len < 5 * 64: medium-copy path without VIS */ | ||
141 | andcc %o3, 0x7, %g0 /* delay slot: condition codes tested at 70, nonzero when dst or src is not 8-byte aligned */ | ||
142 | |||
143 | /* Clobbers o5/g1/g2/g3/g7/icc/xcc. */ | ||
144 | VISEntry | ||
145 | |||
146 | /* Is 'dst' already aligned on a 64-byte boundary? */ | ||
147 | andcc %o0, 0x3f, %g2 | ||
148 | be,pt %XCC, 2f | ||
149 | |||
150 | /* Compute abs((dst & 0x3f) - 0x40) into %g2. This is the number | ||
151 | * of bytes to copy to make 'dst' 64-byte aligned. We pre- | ||
152 | * subtract this from 'len'. | ||
153 | */ | ||
154 | sub %o0, %o1, %GLOBAL_SPARE /* delay slot of the branch above: GLOBAL_SPARE = dst - src */ | ||
155 | sub %g2, 0x40, %g2 | ||
156 | sub %g0, %g2, %g2 | ||
157 | sub %o2, %g2, %o2 | ||
158 | andcc %g2, 0x7, %g1 /* g1 = leading bytes to copy one at a time */ | ||
159 | be,pt %icc, 2f | ||
160 | and %g2, 0x38, %g2 /* delay slot: g2 = rest of the alignment bytes, a multiple of 8 */ | ||
161 | |||
162 | 1: subcc %g1, 0x1, %g1 | ||
163 | EX_LD(LOAD(ldub, %o1 + 0x00, %o3)) | ||
164 | EX_ST(STORE(stb, %o3, %o1 + %GLOBAL_SPARE)) /* src + (dst - src) addresses the matching dst byte */ | ||
165 | bgu,pt %XCC, 1b | ||
166 | add %o1, 0x1, %o1 | ||
167 | |||
168 | add %o1, %GLOBAL_SPARE, %o0 /* recompute dst from the advanced src */ | ||
169 | |||
170 | 2: cmp %g2, 0x0 | ||
171 | and %o1, 0x7, %g1 /* g1 = offset of src within its doubleword */ | ||
172 | be,pt %icc, 3f | ||
173 | alignaddr %o1, %g0, %o1 /* delay slot: round src down to 8 bytes and record the offset used by faligndata */ | ||
174 | |||
175 | EX_LD(LOAD(ldd, %o1, %f4)) /* copy g2 bytes, 8 per store, alternating %f4 and %f6 as the newest source doubleword */ | ||
176 | 1: EX_LD(LOAD(ldd, %o1 + 0x8, %f6)) | ||
177 | add %o1, 0x8, %o1 | ||
178 | subcc %g2, 0x8, %g2 | ||
179 | faligndata %f4, %f6, %f0 | ||
180 | EX_ST(STORE(std, %f0, %o0)) | ||
181 | be,pn %icc, 3f | ||
182 | add %o0, 0x8, %o0 | ||
183 | |||
184 | EX_LD(LOAD(ldd, %o1 + 0x8, %f4)) | ||
185 | add %o1, 0x8, %o1 | ||
186 | subcc %g2, 0x8, %g2 | ||
187 | faligndata %f6, %f4, %f0 | ||
188 | EX_ST(STORE(std, %f0, %o0)) | ||
189 | bne,pt %icc, 1b | ||
190 | add %o0, 0x8, %o0 | ||
191 | |||
192 | /* Destination is 64-byte aligned. */ | ||
193 | 3: | ||
194 | membar #LoadStore | #StoreStore | #StoreLoad | ||
195 | |||
196 | subcc %o2, 0x40, %GLOBAL_SPARE | ||
197 | add %o1, %g1, %g1 /* g1 = byte-accurate src address (aligned src plus its offset) */ | ||
198 | andncc %GLOBAL_SPARE, (0x40 - 1), %GLOBAL_SPARE /* GLOBAL_SPARE = (len - 64) rounded down to a multiple of 64 */ | ||
199 | srl %g1, 3, %g2 | ||
200 | sub %o2, %GLOBAL_SPARE, %g3 /* g3 = len minus that block-copied amount */ | ||
201 | andn %o1, (0x40 - 1), %o1 /* round src down to a 64-byte boundary for the block loads */ | ||
202 | and %g2, 7, %g2 /* g2 = index of the src doubleword within its 64-byte block; selects the loop instance */ | ||
203 | andncc %g3, 0x7, %g3 | ||
204 | fmovd %f0, %f2 | ||
205 | sub %g3, 0x8, %g3 /* g3 = leftover rounded down to 8, minus 8: the count consumed at labels 40-63 and 93 */ | ||
206 | sub %o2, %GLOBAL_SPARE, %o2 | ||
207 | |||
208 | add %g1, %GLOBAL_SPARE, %g1 | ||
209 | subcc %o2, %g3, %o2 /* o2 = bytes left for the byte loop at 95 */ | ||
210 | |||
211 | EX_LD(LOAD_BLK(%o1, %f0)) | ||
212 | add %o1, 0x40, %o1 | ||
213 | add %g1, %g3, %g1 /* g1 = src address at which the byte loop at 95 resumes */ | ||
214 | EX_LD(LOAD_BLK(%o1, %f16)) | ||
215 | add %o1, 0x40, %o1 | ||
216 | sub %GLOBAL_SPARE, 0x80, %GLOBAL_SPARE /* loop counter for the unrolled loop; each exit path stores the final 0x80 bytes */ | ||
217 | EX_LD(LOAD_BLK(%o1, %f32)) | ||
218 | add %o1, 0x40, %o1 | ||
219 | |||
220 | /* There are 8 instances of the unrolled loop, | ||
221 | * one for each possible alignment of the | ||
222 | * source buffer. Each loop instance is 452 | ||
223 | * bytes. | ||
224 | */ | ||
225 | sll %g2, 3, %o3 /* o3 = 8 * g2 */ | ||
226 | sub %o3, %g2, %o3 /* o3 = 7 * g2 */ | ||
227 | sllx %o3, 4, %o3 /* o3 = 112 * g2 */ | ||
228 | add %o3, %g2, %o3 /* o3 = 113 * g2 */ | ||
229 | sllx %o3, 2, %g2 /* g2 = 452 * index, the byte offset of the selected loop instance */ | ||
230 | 1: rd %pc, %o3 | ||
231 | add %o3, %lo(1f - 1b), %o3 /* o3 = address of the first loop instance */ | ||
232 | jmpl %o3 + %g2, %g0 /* computed jump; the return address is discarded into %g0 */ | ||
233 | nop | ||
234 | |||
235 | .align 64 | ||
236 | 1: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) /* loop instance 0: output starts at %f0 */ | ||
237 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
238 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | ||
239 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
240 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) | ||
241 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
242 | ba,pt %xcc, 1b+4 /* back to the loop start, skipping its first faligndata */ | ||
243 | faligndata %f0, %f2, %f48 /* delay slot: performs the faligndata that was skipped */ | ||
244 | 1: FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) /* exit from LOOP_CHUNK1: store two more blocks from loaded data, then continue at 40 */ | ||
245 | STORE_SYNC(o0, f48) | ||
246 | FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) | ||
247 | STORE_JUMP(o0, f48, 40f) | ||
248 | 2: FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0) /* exit from LOOP_CHUNK2: same, continuing at 48 */ | ||
249 | STORE_SYNC(o0, f48) | ||
250 | FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) | ||
251 | STORE_JUMP(o0, f48, 48f) | ||
252 | 3: FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16) /* exit from LOOP_CHUNK3: same, continuing at 56 */ | ||
253 | STORE_SYNC(o0, f48) | ||
254 | FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32) | ||
255 | STORE_JUMP(o0, f48, 56f) | ||
256 | |||
257 | 1: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) /* loop instance 1: output starts at %f2; exits continue at 41, 49, 57 */ | ||
258 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
259 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | ||
260 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
261 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | ||
262 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
263 | ba,pt %xcc, 1b+4 | ||
264 | faligndata %f2, %f4, %f48 | ||
265 | 1: FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | ||
266 | STORE_SYNC(o0, f48) | ||
267 | FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | ||
268 | STORE_JUMP(o0, f48, 41f) | ||
269 | 2: FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2) | ||
270 | STORE_SYNC(o0, f48) | ||
271 | FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) | ||
272 | STORE_JUMP(o0, f48, 49f) | ||
273 | 3: FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18) | ||
274 | STORE_SYNC(o0, f48) | ||
275 | FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34) | ||
276 | STORE_JUMP(o0, f48, 57f) | ||
277 | |||
278 | 1: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) /* loop instance 2: output starts at %f4; exits continue at 42, 50, 58 */ | ||
279 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
280 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | ||
281 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
282 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | ||
283 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
284 | ba,pt %xcc, 1b+4 | ||
285 | faligndata %f4, %f6, %f48 | ||
286 | 1: FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | ||
287 | STORE_SYNC(o0, f48) | ||
288 | FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | ||
289 | STORE_JUMP(o0, f48, 42f) | ||
290 | 2: FREG_FROB(f36,f38,f40,f42,f44,f46,f0, f2, f4) | ||
291 | STORE_SYNC(o0, f48) | ||
292 | FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) | ||
293 | STORE_JUMP(o0, f48, 50f) | ||
294 | 3: FREG_FROB(f4, f6, f8, f10,f12,f14,f16,f18,f20) | ||
295 | STORE_SYNC(o0, f48) | ||
296 | FREG_FROB(f20,f22,f24,f26,f28,f30,f32,f34,f36) | ||
297 | STORE_JUMP(o0, f48, 58f) | ||
298 | |||
299 | 1: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) /* loop instance 3: output starts at %f6; exits continue at 43, 51, 59 */ | ||
300 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
301 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | ||
302 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
303 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | ||
304 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
305 | ba,pt %xcc, 1b+4 | ||
306 | faligndata %f6, %f8, %f48 | ||
307 | 1: FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | ||
308 | STORE_SYNC(o0, f48) | ||
309 | FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | ||
310 | STORE_JUMP(o0, f48, 43f) | ||
311 | 2: FREG_FROB(f38,f40,f42,f44,f46,f0, f2, f4, f6) | ||
312 | STORE_SYNC(o0, f48) | ||
313 | FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) | ||
314 | STORE_JUMP(o0, f48, 51f) | ||
315 | 3: FREG_FROB(f6, f8, f10,f12,f14,f16,f18,f20,f22) | ||
316 | STORE_SYNC(o0, f48) | ||
317 | FREG_FROB(f22,f24,f26,f28,f30,f32,f34,f36,f38) | ||
318 | STORE_JUMP(o0, f48, 59f) | ||
319 | |||
320 | 1: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) /* loop instance 4: output starts at %f8; exits continue at 44, 52, 60 */ | ||
321 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
322 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | ||
323 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
324 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | ||
325 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
326 | ba,pt %xcc, 1b+4 | ||
327 | faligndata %f8, %f10, %f48 | ||
328 | 1: FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | ||
329 | STORE_SYNC(o0, f48) | ||
330 | FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | ||
331 | STORE_JUMP(o0, f48, 44f) | ||
332 | 2: FREG_FROB(f40,f42,f44,f46,f0, f2, f4, f6, f8) | ||
333 | STORE_SYNC(o0, f48) | ||
334 | FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) | ||
335 | STORE_JUMP(o0, f48, 52f) | ||
336 | 3: FREG_FROB(f8, f10,f12,f14,f16,f18,f20,f22,f24) | ||
337 | STORE_SYNC(o0, f48) | ||
338 | FREG_FROB(f24,f26,f28,f30,f32,f34,f36,f38,f40) | ||
339 | STORE_JUMP(o0, f48, 60f) | ||
340 | |||
341 | 1: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) /* loop instance 5: output starts at %f10; exits continue at 45, 53, 61 */ | ||
342 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
343 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | ||
344 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
345 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | ||
346 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
347 | ba,pt %xcc, 1b+4 | ||
348 | faligndata %f10, %f12, %f48 | ||
349 | 1: FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | ||
350 | STORE_SYNC(o0, f48) | ||
351 | FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | ||
352 | STORE_JUMP(o0, f48, 45f) | ||
353 | 2: FREG_FROB(f42,f44,f46,f0, f2, f4, f6, f8, f10) | ||
354 | STORE_SYNC(o0, f48) | ||
355 | FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) | ||
356 | STORE_JUMP(o0, f48, 53f) | ||
357 | 3: FREG_FROB(f10,f12,f14,f16,f18,f20,f22,f24,f26) | ||
358 | STORE_SYNC(o0, f48) | ||
359 | FREG_FROB(f26,f28,f30,f32,f34,f36,f38,f40,f42) | ||
360 | STORE_JUMP(o0, f48, 61f) | ||
361 | |||
362 | 1: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) /* loop instance 6: output starts at %f12; exits continue at 46, 54, 62 */ | ||
363 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
364 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | ||
365 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
366 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | ||
367 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
368 | ba,pt %xcc, 1b+4 | ||
369 | faligndata %f12, %f14, %f48 | ||
370 | 1: FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | ||
371 | STORE_SYNC(o0, f48) | ||
372 | FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | ||
373 | STORE_JUMP(o0, f48, 46f) | ||
374 | 2: FREG_FROB(f44,f46,f0, f2, f4, f6, f8, f10,f12) | ||
375 | STORE_SYNC(o0, f48) | ||
376 | FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) | ||
377 | STORE_JUMP(o0, f48, 54f) | ||
378 | 3: FREG_FROB(f12,f14,f16,f18,f20,f22,f24,f26,f28) | ||
379 | STORE_SYNC(o0, f48) | ||
380 | FREG_FROB(f28,f30,f32,f34,f36,f38,f40,f42,f44) | ||
381 | STORE_JUMP(o0, f48, 62f) | ||
382 | |||
383 | 1: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) /* loop instance 7: output starts at %f14; exits continue at 47, 55, 63 */ | ||
384 | LOOP_CHUNK1(o1, o0, GLOBAL_SPARE, 1f) | ||
385 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | ||
386 | LOOP_CHUNK2(o1, o0, GLOBAL_SPARE, 2f) | ||
387 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | ||
388 | LOOP_CHUNK3(o1, o0, GLOBAL_SPARE, 3f) | ||
389 | ba,pt %xcc, 1b+4 | ||
390 | faligndata %f14, %f16, %f48 | ||
391 | 1: FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | ||
392 | STORE_SYNC(o0, f48) | ||
393 | FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | ||
394 | STORE_JUMP(o0, f48, 47f) | ||
395 | 2: FREG_FROB(f46,f0, f2, f4, f6, f8, f10,f12,f14) | ||
396 | STORE_SYNC(o0, f48) | ||
397 | FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) | ||
398 | STORE_JUMP(o0, f48, 55f) | ||
399 | 3: FREG_FROB(f14,f16,f18,f20,f22,f24,f26,f28,f30) | ||
400 | STORE_SYNC(o0, f48) | ||
401 | FREG_FROB(f30,f32,f34,f36,f38,f40,f42,f44,f46) | ||
402 | STORE_JUMP(o0, f48, 63f) | ||
403 | |||
404 | 40: FINISH_VISCHUNK(o0, f0, f2, g3) /* finish table: each entry falls through to the next, storing one doubleword while g3 stays non-negative */ | ||
405 | 41: FINISH_VISCHUNK(o0, f2, f4, g3) | ||
406 | 42: FINISH_VISCHUNK(o0, f4, f6, g3) | ||
407 | 43: FINISH_VISCHUNK(o0, f6, f8, g3) | ||
408 | 44: FINISH_VISCHUNK(o0, f8, f10, g3) | ||
409 | 45: FINISH_VISCHUNK(o0, f10, f12, g3) | ||
410 | 46: FINISH_VISCHUNK(o0, f12, f14, g3) | ||
411 | 47: UNEVEN_VISCHUNK(o0, f14, f0, g3) /* copy %f14 to %f0 and branch to the ldd loop at 93 */ | ||
412 | 48: FINISH_VISCHUNK(o0, f16, f18, g3) | ||
413 | 49: FINISH_VISCHUNK(o0, f18, f20, g3) | ||
414 | 50: FINISH_VISCHUNK(o0, f20, f22, g3) | ||
415 | 51: FINISH_VISCHUNK(o0, f22, f24, g3) | ||
416 | 52: FINISH_VISCHUNK(o0, f24, f26, g3) | ||
417 | 53: FINISH_VISCHUNK(o0, f26, f28, g3) | ||
418 | 54: FINISH_VISCHUNK(o0, f28, f30, g3) | ||
419 | 55: UNEVEN_VISCHUNK(o0, f30, f0, g3) /* copy %f30 to %f0 and branch to the ldd loop at 93 */ | ||
420 | 56: FINISH_VISCHUNK(o0, f32, f34, g3) | ||
421 | 57: FINISH_VISCHUNK(o0, f34, f36, g3) | ||
422 | 58: FINISH_VISCHUNK(o0, f36, f38, g3) | ||
423 | 59: FINISH_VISCHUNK(o0, f38, f40, g3) | ||
424 | 60: FINISH_VISCHUNK(o0, f40, f42, g3) | ||
425 | 61: FINISH_VISCHUNK(o0, f42, f44, g3) | ||
426 | 62: FINISH_VISCHUNK(o0, f44, f46, g3) | ||
427 | 63: UNEVEN_VISCHUNK_LAST(o0, f46, f0, g3) /* copy %f46 to %f0 and fall through into 93 */ | ||
428 | |||
429 | 93: EX_LD(LOAD(ldd, %o1, %f2)) /* doubleword loop: fetch further source doublewords with ldd, alternating %f2 and %f0 */ | ||
430 | add %o1, 8, %o1 | ||
431 | subcc %g3, 8, %g3 | ||
432 | faligndata %f0, %f2, %f8 | ||
433 | EX_ST(STORE(std, %f8, %o0)) | ||
434 | bl,pn %xcc, 95f /* g3 went negative: continue with the byte tail */ | ||
435 | add %o0, 8, %o0 | ||
436 | EX_LD(LOAD(ldd, %o1, %f0)) | ||
437 | add %o1, 8, %o1 | ||
438 | subcc %g3, 8, %g3 | ||
439 | faligndata %f2, %f0, %f8 | ||
440 | EX_ST(STORE(std, %f8, %o0)) | ||
441 | bge,pt %xcc, 93b | ||
442 | add %o0, 8, %o0 | ||
443 | |||
444 | 95: brz,pt %o2, 2f /* no trailing bytes: skip the byte loop */ | ||
445 | mov %g1, %o1 /* delay slot: switch src to the byte-accurate address kept in g1 */ | ||
446 | |||
447 | 1: EX_LD(LOAD(ldub, %o1, %o3)) /* copy the last o2 bytes one at a time */ | ||
448 | add %o1, 1, %o1 | ||
449 | subcc %o2, 1, %o2 | ||
450 | EX_ST(STORE(stb, %o3, %o0)) | ||
451 | bne,pt %xcc, 1b | ||
452 | add %o0, 1, %o0 | ||
453 | |||
454 | 2: membar #StoreLoad | #StoreStore | ||
455 | VISExit | ||
456 | retl | ||
457 | mov EX_RETVAL(%o4), %o0 /* delay slot: return the original dst saved in %o4 */ | ||
458 | |||
459 | .align 64 | ||
460 | 70: /* 16 <= len < (5 * 64) */ | ||
461 | bne,pn %XCC, 75f /* tests the andcc issued before the jump here: dst or src not 8-byte aligned */ | ||
462 | sub %o0, %o1, %o3 /* delay slot, always executed: o3 = dst - src */ | ||
463 | |||
464 | 72: andn %o2, 0xf, %GLOBAL_SPARE /* GLOBAL_SPARE = bytes to copy 16 at a time */ | ||
465 | and %o2, 0xf, %o2 /* o2 = remainder below 16 */ | ||
466 | 1: EX_LD(LOAD(ldx, %o1 + 0x00, %o5)) | ||
467 | EX_LD(LOAD(ldx, %o1 + 0x08, %g1)) | ||
468 | subcc %GLOBAL_SPARE, 0x10, %GLOBAL_SPARE | ||
469 | EX_ST(STORE(stx, %o5, %o1 + %o3)) /* src + (dst - src) addresses the matching dst location */ | ||
470 | add %o1, 0x8, %o1 | ||
471 | EX_ST(STORE(stx, %g1, %o1 + %o3)) | ||
472 | bgu,pt %XCC, 1b | ||
473 | add %o1, 0x8, %o1 | ||
474 | 73: andcc %o2, 0x8, %g0 /* copy one 8-byte unit if bit 3 of the remainder is set */ | ||
475 | be,pt %XCC, 1f | ||
476 | nop | ||
477 | EX_LD(LOAD(ldx, %o1, %o5)) | ||
478 | sub %o2, 0x8, %o2 | ||
479 | EX_ST(STORE(stx, %o5, %o1 + %o3)) | ||
480 | add %o1, 0x8, %o1 | ||
481 | 1: andcc %o2, 0x4, %g0 /* copy one 4-byte unit if bit 2 of the remainder is set */ | ||
482 | be,pt %XCC, 1f | ||
483 | nop | ||
484 | EX_LD(LOAD(lduw, %o1, %o5)) | ||
485 | sub %o2, 0x4, %o2 | ||
486 | EX_ST(STORE(stw, %o5, %o1 + %o3)) | ||
487 | add %o1, 0x4, %o1 | ||
488 | 1: cmp %o2, 0 | ||
489 | be,pt %XCC, 85f /* nothing left: return */ | ||
490 | nop | ||
491 | ba,pt %xcc, 90f /* otherwise finish in the byte loop at 90 */ | ||
492 | nop | ||
493 | |||
494 | 75: andcc %o0, 0x7, %g1 | ||
495 | sub %g1, 0x8, %g1 | ||
496 | be,pn %icc, 2f /* dst already 8-byte aligned: skip the byte prologue */ | ||
497 | sub %g0, %g1, %g1 /* delay slot: g1 = 8 - (dst & 7) */ | ||
498 | sub %o2, %g1, %o2 | ||
499 | |||
500 | 1: EX_LD(LOAD(ldub, %o1, %o5)) /* copy g1 bytes to bring dst to an 8-byte boundary */ | ||
501 | subcc %g1, 1, %g1 | ||
502 | EX_ST(STORE(stb, %o5, %o1 + %o3)) | ||
503 | bgu,pt %icc, 1b | ||
504 | add %o1, 1, %o1 | ||
505 | |||
506 | 2: add %o1, %o3, %o0 /* recompute dst from the current src */ | ||
507 | andcc %o1, 0x7, %g1 | ||
508 | bne,pt %icc, 8f /* src is not 8-byte aligned: use the shift-and-merge loop at 8 */ | ||
509 | sll %g1, 3, %g1 /* delay slot: g1 = src misalignment in bits */ | ||
510 | |||
511 | cmp %o2, 16 | ||
512 | bgeu,pt %icc, 72b /* both aligned with at least 16 bytes left: 16-byte loop */ | ||
513 | nop | ||
514 | ba,a,pt %xcc, 73b /* both aligned with fewer than 16 bytes left */ | ||
515 | |||
516 | 8: mov 64, %o3 | ||
517 | andn %o1, 0x7, %o1 /* round src down to an 8-byte boundary */ | ||
518 | EX_LD(LOAD(ldx, %o1, %g2)) | ||
519 | sub %o3, %g1, %o3 /* o3 = 64 - g1, the complementary shift count */ | ||
520 | andn %o2, 0x7, %GLOBAL_SPARE /* GLOBAL_SPARE = bytes to copy 8 at a time */ | ||
521 | sllx %g2, %g1, %g2 /* g2 = bits carried over from the first doubleword */ | ||
522 | 1: EX_LD(LOAD(ldx, %o1 + 0x8, %g3)) | ||
523 | subcc %GLOBAL_SPARE, 0x8, %GLOBAL_SPARE | ||
524 | add %o1, 0x8, %o1 | ||
525 | srlx %g3, %o3, %o5 | ||
526 | or %o5, %g2, %o5 /* merge the carried bits with the top of the next doubleword */ | ||
527 | EX_ST(STORE(stx, %o5, %o0)) | ||
528 | add %o0, 0x8, %o0 | ||
529 | bgu,pt %icc, 1b | ||
530 | sllx %g3, %g1, %g2 /* delay slot: carry the rest of this doubleword into the next store */ | ||
531 | |||
532 | srl %g1, 3, %g1 /* g1 back from bits to bytes */ | ||
533 | andcc %o2, 0x7, %o2 | ||
534 | be,pn %icc, 85f /* no trailing bytes: return */ | ||
535 | add %o1, %g1, %o1 /* delay slot: restore the byte-accurate src address */ | ||
536 | ba,pt %xcc, 90f | ||
537 | sub %o0, %o1, %o3 /* delay slot: o3 = dst - src, as the byte loop at 90 expects */ | ||
538 | |||
539 | .align 64 | ||
540 | 80: /* 0 < len < 16 */ | ||
541 | andcc %o3, 0x3, %g0 /* o3 holds dst OR src OR len here */ | ||
542 | bne,pn %XCC, 90f /* any of the three not a multiple of 4: byte loop */ | ||
543 | sub %o0, %o1, %o3 /* delay slot, always executed: o3 = dst - src */ | ||
544 | |||
545 | 1: EX_LD(LOAD(lduw, %o1, %g1)) /* copy 4 bytes per iteration */ | ||
546 | subcc %o2, 4, %o2 | ||
547 | EX_ST(STORE(stw, %g1, %o1 + %o3)) | ||
548 | bgu,pt %XCC, 1b | ||
549 | add %o1, 4, %o1 | ||
550 | |||
551 | 85: retl | ||
552 | mov EX_RETVAL(%o4), %o0 /* delay slot: return the original dst saved in %o4 */ | ||
553 | |||
554 | .align 32 | ||
555 | 90: EX_LD(LOAD(ldub, %o1, %g1)) /* byte loop: expects o3 = dst - src and o2 = bytes left, nonzero */ | ||
556 | subcc %o2, 1, %o2 | ||
557 | EX_ST(STORE(stb, %g1, %o1 + %o3)) | ||
558 | bgu,pt %XCC, 90b | ||
559 | add %o1, 1, %o1 | ||
560 | retl | ||
561 | mov EX_RETVAL(%o4), %o0 /* delay slot: return the original dst saved in %o4 */ | ||
562 | |||
563 | .size FUNC_NAME, .-FUNC_NAME | ||