diff options
Diffstat (limited to 'arch/sparc64/lib/csum_copy.S')
-rw-r--r-- | arch/sparc64/lib/csum_copy.S | 308 |
1 files changed, 308 insertions, 0 deletions
diff --git a/arch/sparc64/lib/csum_copy.S b/arch/sparc64/lib/csum_copy.S new file mode 100644 index 000000000000..71af48839064 --- /dev/null +++ b/arch/sparc64/lib/csum_copy.S | |||
@@ -0,0 +1,308 @@ | |||
1 | /* csum_copy.S: Checksum+copy code for sparc64 | ||
2 | * | ||
3 | * Copyright (C) 2005 David S. Miller <davem@davemloft.net> | ||
4 | */ | ||
5 | |||
/* Spare global register used as an extra temporary by the code below:
 * %g7 when built with __KERNEL__ defined, %g5 otherwise.
 */
#if defined(__KERNEL__)
#define GLOBAL_SPARE	%g7
#else
#define GLOBAL_SPARE	%g5
#endif

/* EX_LD() wraps every load from the source buffer and EX_ST() every
 * store to the destination buffer in the code below.  Unless they were
 * already defined before this point, they (and EX_RETVAL) expand to
 * their argument unchanged.
 */
#if !defined(EX_LD)
#define EX_LD(insn)	insn
#endif

#if !defined(EX_ST)
#define EX_ST(insn)	insn
#endif

#if !defined(EX_RETVAL)
#define EX_RETVAL(val)	val
#endif

/* Memory access templates, overridable in the same way:
 *   LOAD(op, mem, reg)   emits   op [mem], reg
 *   STORE(op, reg, mem)  emits   op reg, [mem]
 */
#if !defined(LOAD)
#define LOAD(op,mem,reg)	op [mem], reg
#endif

#if !defined(STORE)
#define STORE(op,reg,mem)	op reg, [mem]
#endif

/* Symbol name given to the routine below. */
#if !defined(FUNC_NAME)
#define FUNC_NAME	csum_partial_copy_nocheck
#endif

	/* Tell the assembler that %g2 and %g3 are used as scratch. */
	.register	%g2, #scratch
	.register	%g3, #scratch

	.text
40 | |||
/*
 * FUNC_NAME: copy a buffer and checksum it in the same pass.
 *
 * In:   %o0 = src, %o1 = dst, %o2 = len (bytes), %o3 = sum
 * Out:  %o0 = sum plus the 16-bit folded checksum of the copied bytes.
 *       A carry out of bit 31 of that addition is wrapped back into
 *       bit 0 and the upper 32 bits of %o0 are cleared.
 * Uses: %o4, %o5, %g1, %g2, %g3, GLOBAL_SPARE, integer condition codes.
 *
 * Layout:
 *   90:       alignment fix-up for the word-copy path (placed ahead of
 *             the entry point, reached by a backward branch from it)
 *   FUNC_NAME entry; picks a path from the low two bits of src ^ dst
 *   80: - 70: word-copy path (src and dst share 4-byte alignment)
 *   95: - end byte-store path (src and dst alignment differ)
 *
 * Reading note: the instruction that follows a branch is its delay
 * slot and is shown indented by one extra space.  On a conditional
 * branch with ",a" the slot is executed only when the branch is taken.
 */

	/* Entered from the entry code when src is not 4-byte aligned.
	 * The condition codes were set there from (src & 1).
	 */
90:
	/* We checked for zero length already, so there must be
	 * at least one byte.
	 */
	be,pt		%icc, 1f		/* src even: no leading byte */
	 nop
	/* Odd src: copy one byte.  It seeds the sum in %o4, which the
	 * entry code cleared.
	 */
	EX_LD(LOAD(ldub, %o0 + 0x00, %o4))
	add		%o0, 1, %o0
	sub		%o2, 1, %o2
	EX_ST(STORE(stb, %o4, %o1 + 0x00))
	add		%o1, 1, %o1
	/* src is now 2-byte aligned; bit 1 clear means word aligned.
	 * The cmp sits in a non-annulled delay slot, so it runs either
	 * way and supplies the condition codes for the blu below.
	 */
1:	andcc		%o0, 0x2, %g0
	be,pn		%icc, 80f
	 cmp		%o2, 2
	blu,pn		%icc, 60f		/* < 2 bytes left: tail code */
	 nop
	/* Copy one halfword to reach word alignment and add it in. */
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	add		%o0, 2, %o0
	sub		%o2, 2, %o2
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 2, %o1
	ba,pt		%xcc, 80f
	 add		%o5, %o4, %o4

	.globl		FUNC_NAME
FUNC_NAME:		/* %o0=src, %o1=dst, %o2=len, %o3=sum */
	LOAD(prefetch, %o0 + 0x000, #n_reads)
	xor		%o0, %o1, %g1
	clr		%o4			/* %o4 = running data sum */
	andcc		%g1, 0x3, %g0
	bne,pn		%icc, 95f		/* low 2 bits of src, dst differ */
	 LOAD(prefetch, %o0 + 0x040, #n_reads)

	brz,pn		%o2, 70f		/* len == 0: return sum as is */
	 andcc		%o0, 0x3, %g0

	/* We "remember" whether the lowest bit in the address
	 * was set in GLOBAL_SPARE.  Because if it is, we have to swap
	 * upper and lower 8 bit fields of the sum we calculate.
	 */
	bne,pn		%icc, 90b		/* src not word aligned */
	 andcc		%o0, 0x1, GLOBAL_SPARE

80:
	/* %g3 = bytes handled by the 64-byte loop, %o2 = what is left. */
	LOAD(prefetch, %o0 + 0x080, #n_reads)
	andncc		%o2, 0x3f, %g3

	LOAD(prefetch, %o0 + 0x0c0, #n_reads)
	sub		%o2, %g3, %o2
	brz,pn		%g3, 2f
	 LOAD(prefetch, %o0 + 0x100, #n_reads)

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32-bits and so on.
	 */
	ba,pt		%xcc, 1f
	 LOAD(prefetch, %o0 + 0x140, #n_reads)

	/* 64 bytes per iteration: sixteen word loads, each added into
	 * %o4 and stored to dst, rotating through %o5, %g1 and %g2.
	 */
	.align		32
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	EX_LD(LOAD(lduw, %o0 + 0x04, %g1))
	EX_LD(LOAD(lduw, %o0 + 0x08, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	EX_LD(LOAD(lduw, %o0 + 0x0c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x04))
	EX_LD(LOAD(lduw, %o0 + 0x10, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x08))
	EX_LD(LOAD(lduw, %o0 + 0x14, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x0c))
	EX_LD(LOAD(lduw, %o0 + 0x18, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x10))
	EX_LD(LOAD(lduw, %o0 + 0x1c, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x14))
	EX_LD(LOAD(lduw, %o0 + 0x20, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x18))
	EX_LD(LOAD(lduw, %o0 + 0x24, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x1c))
	EX_LD(LOAD(lduw, %o0 + 0x28, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x20))
	EX_LD(LOAD(lduw, %o0 + 0x2c, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x24))
	EX_LD(LOAD(lduw, %o0 + 0x30, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x28))
	EX_LD(LOAD(lduw, %o0 + 0x34, %g1))
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x2c))
	EX_LD(LOAD(lduw, %o0 + 0x38, %g2))
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x30))
	EX_LD(LOAD(lduw, %o0 + 0x3c, %o5))
	add		%o4, %g1, %o4
	EX_ST(STORE(stw, %g1, %o1 + 0x34))
	LOAD(prefetch, %o0 + 0x180, #n_reads)
	add		%o4, %g2, %o4
	EX_ST(STORE(stw, %g2, %o1 + 0x38))
	subcc		%g3, 0x40, %g3
	add		%o0, 0x40, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x3c))
	bne,pt		%icc, 1b
	 add		%o1, 0x40, %o1

	/* Whole words left over after the 64-byte loop.  The sub in the
	 * delay slot also runs when %g3 is zero, where it changes nothing.
	 */
2:	and		%o2, 0x3c, %g3
	brz,pn		%g3, 2f
	 sub		%o2, %g3, %o2
1:	EX_LD(LOAD(lduw, %o0 + 0x00, %o5))
	subcc		%g3, 0x4, %g3
	add		%o0, 0x4, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(stw, %o5, %o1 + 0x00))
	bne,pt		%icc, 1b
	 add		%o1, 0x4, %o1

2:
	/* fold 64-->32 */
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4

	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

60:
	/* %o4 has the 16-bit sum we have calculated so-far.  */
	cmp		%o2, 2
	blu,pt		%icc, 1f
	 nop
	/* One trailing halfword. */
	EX_LD(LOAD(lduh, %o0 + 0x00, %o5))
	sub		%o2, 2, %o2
	add		%o0, 2, %o0
	add		%o4, %o5, %o4
	EX_ST(STORE(sth, %o5, %o1 + 0x00))
	add		%o1, 0x2, %o1
1:	brz,pt		%o2, 1f
	 nop
	/* One trailing byte; it is added as the upper byte of a halfword. */
	EX_LD(LOAD(ldub, %o0 + 0x00, %o5))
	sub		%o2, 1, %o2
	add		%o0, 1, %o0
	EX_ST(STORE(stb, %o5, %o1 + 0x00))
	sllx		%o5, 8, %o5
	add		%o1, 1, %o1
	add		%o4, %o5, %o4
1:
	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

1:	brz,pt		GLOBAL_SPARE, 1f
	 nop

	/* We started with an odd byte, byte-swap the result.  */
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4

	/* Add the data checksum into the caller's sum.  addcc/addc wrap
	 * a carry out of bit 31 back into bit 0, the same way the
	 * byte-store path does at label 4 below.  A plain add would leave
	 * that carry in bit 32, where a 32-bit result loses it.
	 */
1:	addcc		%o3, %o4, %o3
	addc		%g0, %o3, %o3

70:
	/* Return the low 32 bits, zero-extended. */
	retl
	 srl		%o3, 0, %o0

	/* Byte-store path: the low two bits of src and dst differ, so the
	 * data is loaded in the widest unit src alignment allows and
	 * stored to dst one byte at a time.
	 *   GLOBAL_SPARE = running sum
	 *   %o5          = (src & 1), tested for the final byte swap
	 *   %g1          = count of halfwords, later of words, to go
	 */
95:	mov		0, GLOBAL_SPARE
	brlez,pn	%o2, 4f			/* len <= 0: nothing to copy */
	 andcc		%o0, 1, %o5
	be,a,pt		%icc, 1f		/* src even */
	 srl		%o2, 1, %g1
	/* Odd src: copy the leading byte; it seeds the sum. */
	sub		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o0, GLOBAL_SPARE))
	add		%o0, 1, %o0
	EX_ST(STORE(stb, GLOBAL_SPARE, %o1))
	srl		%o2, 1, %g1
	add		%o1, 1, %o1
1:	brz,a,pn	%g1, 3f			/* no full halfword left */
	 andcc		%o2, 1, %g0
	andcc		%o0, 2, %g0
	be,a,pt		%icc, 1f		/* src already word aligned */
	 srl		%g1, 1, %g1		/* halfword count -> word count */
	/* Copy one halfword (as two byte stores) to word-align src. */
	EX_LD(LOAD(lduh, %o0, %o4))
	sub		%o2, 2, %o2
	srl		%o4, 8, %g2
	sub		%g1, 1, %g1
	EX_ST(STORE(stb, %g2, %o1))
	add		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o0, 2, %o0
	srl		%g1, 1, %g1		/* halfword count -> word count */
	add		%o1, 2, %o1
1:	brz,a,pn	%g1, 2f			/* no full word left */
	 andcc		%o2, 2, %g0
	EX_LD(LOAD(lduw, %o0, %o4))
	/* Word loop: one word load, four byte stores, and a 32-bit add
	 * whose carry is wrapped back in.  The load for the next
	 * iteration sits in the annulled delay slot of the loop branch.
	 */
5:	srl		%o4, 24, %g2
	srl		%o4, 16, %g3
	EX_ST(STORE(stb, %g2, %o1))
	srl		%o4, 8, %g2
	EX_ST(STORE(stb, %g3, %o1 + 1))
	add		%o0, 4, %o0
	EX_ST(STORE(stb, %g2, %o1 + 2))
	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 3))
	addc		GLOBAL_SPARE, %g0, GLOBAL_SPARE
	add		%o1, 4, %o1
	subcc		%g1, 1, %g1
	bne,a,pt	%icc, 5b
	 EX_LD(LOAD(lduw, %o0, %o4))
	/* Add the low and high 16-bit halves of the sum together, and
	 * set the condition codes from (len & 2) for the test at 2:.
	 */
	sll		GLOBAL_SPARE, 16, %g2
	srl		GLOBAL_SPARE, 16, GLOBAL_SPARE
	srl		%g2, 16, %g2
	andcc		%o2, 2, %g0
	add		%g2, GLOBAL_SPARE, GLOBAL_SPARE
2:	be,a,pt		%icc, 3f		/* no trailing halfword */
	 andcc		%o2, 1, %g0
	EX_LD(LOAD(lduh, %o0, %o4))
	andcc		%o2, 1, %g0		/* condition codes for 3: */
	srl		%o4, 8, %g2
	add		%o0, 2, %o0
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	EX_ST(STORE(stb, %o4, %o1 + 1))
	add		%o1, 2, %o1
3:	be,a,pt		%icc, 1f		/* no trailing byte */
	 sll		GLOBAL_SPARE, 16, %o4
	/* Trailing byte; it is added as the upper byte of a halfword. */
	EX_LD(LOAD(ldub, %o0, %g2))
	sll		%g2, 8, %o4
	EX_ST(STORE(stb, %g2, %o1))
	add		GLOBAL_SPARE, %o4, GLOBAL_SPARE
	sll		GLOBAL_SPARE, 16, %o4
	/* Fold to 16 bits: adding (sum << 16) to sum leaves low + high in
	 * the upper half; the carry out of that add is added back in.
	 */
1:	addcc		%o4, GLOBAL_SPARE, GLOBAL_SPARE
	srl		GLOBAL_SPARE, 16, %o4
	addc		%g0, %o4, GLOBAL_SPARE
	brz,pt		%o5, 4f			/* src was even: no swap */
	 srl		GLOBAL_SPARE, 8, %o4
	/* src started on an odd byte: swap the two bytes of the result. */
	and		GLOBAL_SPARE, 0xff, %g2
	and		%o4, 0xff, %o4
	sll		%g2, 8, %g2
	or		%g2, %o4, GLOBAL_SPARE
	/* Add into the caller's sum with the carry wrapped back in, and
	 * return the low 32 bits zero-extended.
	 */
4:	addcc		%o3, GLOBAL_SPARE, %o3
	addc		%g0, %o3, %o0
	retl
	 srl		%o0, 0, %o0
	.size		FUNC_NAME, .-FUNC_NAME