diff options
author | Chris Zankel <czankel@tensilica.com> | 2005-06-24 01:01:20 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@ppc970.osdl.org> | 2005-06-24 03:05:21 -0400 |
commit | 249ac17e96811acc3c6402317dd5d5c89d2cbf68 (patch) | |
tree | 0a174065460de196861b85f1d9a48c88b2a2675a /arch/xtensa/lib/checksum.S | |
parent | 5a0015d62668e64c8b6e02e360fbbea121bfd5e6 (diff) |
[PATCH] xtensa: Architecture support for Tensilica Xtensa Part 4
The attached patch provides part 4 of an architecture implementation for the
Tensilica Xtensa CPU series.
Signed-off-by: Chris Zankel <chris@zankel.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'arch/xtensa/lib/checksum.S')
-rw-r--r-- | arch/xtensa/lib/checksum.S | 410 |
1 files changed, 410 insertions, 0 deletions
diff --git a/arch/xtensa/lib/checksum.S b/arch/xtensa/lib/checksum.S new file mode 100644 index 000000000000..e2d64dfd530c --- /dev/null +++ b/arch/xtensa/lib/checksum.S | |||
@@ -0,0 +1,410 @@ | |||
1 | /* | ||
2 | * INET An implementation of the TCP/IP protocol suite for the LINUX | ||
3 | * operating system. INET is implemented using the BSD Socket | ||
4 | * interface as the means of communication with the user level. | ||
5 | * | ||
6 | * IP/TCP/UDP checksumming routines | ||
7 | * | ||
8 | * Xtensa version: Copyright (C) 2001 Tensilica, Inc. by Kevin Chea | ||
9 | * Optimized by Joe Taylor | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public License | ||
13 | * as published by the Free Software Foundation; either version | ||
14 | * 2 of the License, or (at your option) any later version. | ||
15 | */ | ||
16 | |||
17 | #include <asm/errno.h> | ||
18 | #include <linux/linkage.h> | ||
19 | #define _ASMLANGUAGE | ||
20 | #include <xtensa/config/core.h> | ||
21 | |||
22 | /* | ||
23 | * computes a partial checksum, e.g. for TCP/UDP fragments | ||
24 | */ | ||
25 | |||
26 | /* | ||
27 | * unsigned int csum_partial(const unsigned char *buf, int len, | ||
28 | * unsigned int sum); | ||
29 | * a2 = buf | ||
30 | * a3 = len | ||
31 | * a4 = sum | ||
32 | * | ||
33 | * This function is optimized for 2- or 4-byte aligned buf. An odd buf address is handled by a slow byte-wise path. | |
34 | */ | ||
35 | |||
36 | /* ONES_ADD(sum, val): sum += val with end-around carry (ones-complement add). After the 32-bit add, sum < val (unsigned) means the add wrapped, so 1 is added back. val is compared after sum is overwritten, so it must be a different register. Uses numeric local label 99. */ | |
37 | #define ONES_ADD(sum, val) \ | |
38 | add sum, sum, val ; \ | |
39 | bgeu sum, val, 99f ; \ | |
40 | addi sum, sum, 1 ; \ | |
41 | 99: ; | |
42 | |||
43 | .text | ||
44 | ENTRY(csum_partial) | |
45 | /* | |
46 | * Register use: a2 = buf, a3 = len, a4 = running sum, a5 = chunk count or end address, a6-a8 = loaded data. The updated sum is returned in a2. | |
47 | * Experiments with Ethernet and SLIP connections show that buf is aligned on either a 2-byte or 4-byte boundary, so the 4-byte case is the fall-through path. | |
48 | */ | |
49 | entry sp, 32 | |
50 | extui a5, a2, 0, 2 /* a5 = buf & 3 */ | |
51 | bnez a5, 8f /* branch if buf is not 4-byte aligned */ | |
52 | /* Fall-through on common case, 4-byte alignment */ | |
53 | 1: | |
54 | srli a5, a3, 5 /* 32-byte chunks */ | |
55 | #if XCHAL_HAVE_LOOPS | |
56 | loopgtz a5, 2f | |
57 | #else | |
58 | beqz a5, 2f | |
59 | slli a5, a5, 5 | |
60 | add a5, a5, a2 /* a5 = end of last 32-byte chunk */ | |
61 | .Loop1: | |
62 | #endif | |
63 | l32i a6, a2, 0 | |
64 | l32i a7, a2, 4 | |
65 | ONES_ADD(a4, a6) | |
66 | ONES_ADD(a4, a7) | |
67 | l32i a6, a2, 8 | |
68 | l32i a7, a2, 12 | |
69 | ONES_ADD(a4, a6) | |
70 | ONES_ADD(a4, a7) | |
71 | l32i a6, a2, 16 | |
72 | l32i a7, a2, 20 | |
73 | ONES_ADD(a4, a6) | |
74 | ONES_ADD(a4, a7) | |
75 | l32i a6, a2, 24 | |
76 | l32i a7, a2, 28 | |
77 | ONES_ADD(a4, a6) | |
78 | ONES_ADD(a4, a7) | |
79 | addi a2, a2, 4*8 | |
80 | #if !XCHAL_HAVE_LOOPS | |
81 | bne a2, a5, .Loop1 /* exact end test: a2 steps by 32 and lands on a5; a signed blt breaks if the buffer spans 0x80000000 */ | |
82 | #endif | |
83 | 2: | |
84 | extui a5, a3, 2, 3 /* remaining 4-byte chunks */ | |
85 | #if XCHAL_HAVE_LOOPS | |
86 | loopgtz a5, 3f | |
87 | #else | |
88 | beqz a5, 3f | |
89 | slli a5, a5, 2 | |
90 | add a5, a5, a2 /* a5 = end of last 4-byte chunk */ | |
91 | .Loop2: | |
92 | #endif | |
93 | l32i a6, a2, 0 | |
94 | ONES_ADD(a4, a6) | |
95 | addi a2, a2, 4 | |
96 | #if !XCHAL_HAVE_LOOPS | |
97 | bne a2, a5, .Loop2 /* exact end test, see .Loop1 */ | |
98 | #endif | |
99 | 3: | |
100 | _bbci.l a3, 1, 5f /* remaining 2-byte chunk */ | |
101 | l16ui a6, a2, 0 | |
102 | ONES_ADD(a4, a6) | |
103 | addi a2, a2, 2 | |
104 | 5: | |
105 | _bbci.l a3, 0, 7f /* remaining 1-byte chunk */ | |
106 | 6: l8ui a6, a2, 0 | |
107 | #ifdef __XTENSA_EB__ | |
108 | slli a6, a6, 8 /* load byte into bits 8..15 */ | |
109 | #endif | |
110 | ONES_ADD(a4, a6) | |
111 | 7: | |
112 | mov a2, a4 /* return the accumulated sum */ | |
113 | retw | |
114 | |||
115 | /* uncommon case, buf is not 4-byte aligned (2-byte aligned or odd) */ | |
116 | 8: | |
117 | beqz a3, 7b /* branch if len == 0 */ | |
118 | beqi a3, 1, 6b /* branch if len == 1 */ | |
119 | |||
120 | extui a5, a2, 0, 1 /* a5 = buf & 1 */ | |
121 | bnez a5, 8f /* branch if 1-byte aligned */ | |
122 | |||
123 | l16ui a6, a2, 0 /* common case, len >= 2 */ | |
124 | ONES_ADD(a4, a6) | |
125 | addi a2, a2, 2 /* adjust buf */ | |
126 | addi a3, a3, -2 /* adjust len */ | |
127 | j 1b /* now buf is 4-byte aligned */ | |
128 | |||
129 | /* case: odd-byte aligned, len > 1 | |
130 | * This case is dog slow, so don't give us an odd address. | |
131 | * (I don't think this ever happens, but just in case.) | |
132 | */ | |
133 | 8: | |
134 | srli a5, a3, 2 /* 4-byte chunks */ | |
135 | #if XCHAL_HAVE_LOOPS | |
136 | loopgtz a5, 2f | |
137 | #else | |
138 | beqz a5, 2f | |
139 | slli a5, a5, 2 | |
140 | add a5, a5, a2 /* a5 = end of last 4-byte chunk */ | |
141 | .Loop3: | |
142 | #endif | |
143 | l8ui a6, a2, 0 /* byte 0: bits 24..31 on big-endian, 0..7 on little-endian */ | |
144 | l16ui a7, a2, 1 /* bytes 1..2: bits 8..23 after the shift below (a2 is odd, so a2+1 is 2-byte aligned) */ | |
145 | l8ui a8, a2, 3 /* byte 3: bits 0..7 on big-endian, 24..31 on little-endian */ | |
146 | #ifdef __XTENSA_EB__ | |
147 | slli a6, a6, 24 | |
148 | #else | |
149 | slli a8, a8, 24 | |
150 | #endif | |
151 | slli a7, a7, 8 | |
152 | or a7, a7, a6 | |
153 | or a7, a7, a8 /* a7 = the 32-bit word as an aligned l32i would have loaded it */ | |
154 | ONES_ADD(a4, a7) | |
155 | addi a2, a2, 4 | |
156 | #if !XCHAL_HAVE_LOOPS | |
157 | bne a2, a5, .Loop3 /* exact end test, see .Loop1 */ | |
158 | #endif | |
159 | 2: | |
160 | _bbci.l a3, 1, 3f /* remaining 2-byte chunk, still odd addr */ | |
161 | l8ui a6, a2, 0 | |
162 | l8ui a7, a2, 1 | |
163 | #ifdef __XTENSA_EB__ | |
164 | slli a6, a6, 8 | |
165 | #else | |
166 | slli a7, a7, 8 | |
167 | #endif | |
168 | or a7, a7, a6 /* a7 = the 16-bit value in memory byte order */ | |
169 | ONES_ADD(a4, a7) | |
170 | addi a2, a2, 2 | |
171 | 3: | |
172 | j 5b /* branch to handle the remaining byte */ | |
173 | |||
174 | |||
175 | |||
176 | /* | |
177 | * Copy from src to dst while checksumming, otherwise like csum_partial | |
178 | * | |
179 | * The macros SRC and DST wrap one load or store instruction y, label it 9999, and add an __ex_table entry pairing that address with a fixup label: | |
180 | * 6001f for SRC and 6002f for DST. Thus we can call a custom exception handler for each access type. | |
181 | */ | |
182 | |||
183 | #define SRC(y...) \ | |
184 | 9999: y; \ | |
185 | .section __ex_table, "a"; \ | |
186 | .long 9999b, 6001f ; \ | |
187 | .previous | |
188 | |||
189 | #define DST(y...) \ | |
190 | 9999: y; \ | |
191 | .section __ex_table, "a"; \ | |
192 | .long 9999b, 6002f ; \ | |
193 | .previous | |
194 | |||
195 | /* | |
196 | unsigned int csum_partial_copy_generic (const char *src, char *dst, int len, | |
197 | int sum, int *src_err_ptr, int *dst_err_ptr) | |
198 | a2 = src | |
199 | a3 = dst | |
200 | a4 = len | |
201 | a5 = sum | |
202 | a6 = src_err_ptr | |
203 | a7 = dst_err_ptr | |
204 | a8 = temp | |
205 | a9 = temp | |
206 | a10 = temp | |
207 | a11 = original len for exception handling | |
208 | a12 = original dst for exception handling | |
209 | |||
210 | This function is optimized for 4-byte aligned addresses. Other | |
211 | alignments work, but not nearly as efficiently. The updated sum is returned in a2. Every load is wrapped in SRC() and every store in DST(), so a faulting access continues at fixup label 6001 or 6002 respectively. | |
212 | */ | |
213 | |||
214 | ENTRY(csum_partial_copy_generic) | |
215 | entry sp, 32 | |
216 | mov a12, a3 /* keep the original dst for the fixup code */ | |
217 | mov a11, a4 /* keep the original len for the fixup code */ | |
218 | or a10, a2, a3 /* low bits of src | dst give the worst alignment of the two */ | |
219 | |||
220 | /* We optimize the following alignment tests for the 4-byte | |
221 | aligned case. Two bbsi.l instructions might seem more optimal | |
222 | (commented out below). However, both labels 5: and 3: are out | |
223 | of the imm8 range, so the assembler relaxes them into | |
224 | equivalent bbci.l, j combinations, which is actually | |
225 | slower. */ | |
226 | |||
227 | extui a9, a10, 0, 2 | |
228 | beqz a9, 1f /* branch if both are 4-byte aligned */ | |
229 | bbsi.l a10, 0, 5f /* branch if one address is odd */ | |
230 | j 3f /* one address is 2-byte aligned */ | |
231 | |||
232 | /* _bbsi.l a10, 0, 5f */ /* branch if odd address */ | |
233 | /* _bbsi.l a10, 1, 3f */ /* branch if 2-byte-aligned address */ | |
234 | |||
235 | 1: | |
236 | /* src and dst are both 4-byte aligned */ | |
237 | srli a10, a4, 5 /* 32-byte chunks */ | |
238 | #if XCHAL_HAVE_LOOPS | |
239 | loopgtz a10, 2f | |
240 | #else | |
241 | beqz a10, 2f | |
242 | slli a10, a10, 5 | |
243 | add a10, a10, a2 /* a10 = end of last 32-byte src chunk */ | |
244 | .Loop5: | |
245 | #endif | |
246 | SRC( l32i a9, a2, 0 ) | |
247 | SRC( l32i a8, a2, 4 ) | |
248 | DST( s32i a9, a3, 0 ) | |
249 | DST( s32i a8, a3, 4 ) | |
250 | ONES_ADD(a5, a9) | |
251 | ONES_ADD(a5, a8) | |
252 | SRC( l32i a9, a2, 8 ) | |
253 | SRC( l32i a8, a2, 12 ) | |
254 | DST( s32i a9, a3, 8 ) | |
255 | DST( s32i a8, a3, 12 ) | |
256 | ONES_ADD(a5, a9) | |
257 | ONES_ADD(a5, a8) | |
258 | SRC( l32i a9, a2, 16 ) | |
259 | SRC( l32i a8, a2, 20 ) | |
260 | DST( s32i a9, a3, 16 ) | |
261 | DST( s32i a8, a3, 20 ) | |
262 | ONES_ADD(a5, a9) | |
263 | ONES_ADD(a5, a8) | |
264 | SRC( l32i a9, a2, 24 ) | |
265 | SRC( l32i a8, a2, 28 ) | |
266 | DST( s32i a9, a3, 24 ) | |
267 | DST( s32i a8, a3, 28 ) | |
268 | ONES_ADD(a5, a9) | |
269 | ONES_ADD(a5, a8) | |
270 | addi a2, a2, 32 | |
271 | addi a3, a3, 32 | |
272 | #if !XCHAL_HAVE_LOOPS | |
273 | bne a2, a10, .Loop5 /* exact end test: a2 steps by 32 and lands on a10; a signed blt breaks if src spans 0x80000000 */ | |
274 | #endif | |
275 | 2: | |
276 | extui a10, a4, 2, 3 /* remaining 4-byte chunks */ | |
277 | extui a4, a4, 0, 2 /* reset len for general-case, 2-byte chunks */ | |
278 | #if XCHAL_HAVE_LOOPS | |
279 | loopgtz a10, 3f | |
280 | #else | |
281 | beqz a10, 3f | |
282 | slli a10, a10, 2 | |
283 | add a10, a10, a2 /* a10 = end of last 4-byte src chunk */ | |
284 | .Loop6: | |
285 | #endif | |
286 | SRC( l32i a9, a2, 0 ) | |
287 | DST( s32i a9, a3, 0 ) | |
288 | ONES_ADD(a5, a9) | |
289 | addi a2, a2, 4 | |
290 | addi a3, a3, 4 | |
291 | #if !XCHAL_HAVE_LOOPS | |
292 | bne a2, a10, .Loop6 /* exact end test, see .Loop5 */ | |
293 | #endif | |
294 | 3: | |
295 | /* | |
296 | Control comes to here in two cases: (1) It may fall through | |
297 | to here from the 4-byte alignment case to process, at most, | |
298 | one 2-byte chunk. (2) It branches to here from above if | |
299 | either src or dst is 2-byte aligned, and we process all bytes | |
300 | here, except for perhaps a trailing odd byte. It's | |
301 | inefficient, so align your addresses to 4-byte boundaries. | |
302 | |||
303 | a2 = src | |
304 | a3 = dst | |
305 | a4 = len | |
306 | a5 = sum | |
307 | */ | |
308 | srli a10, a4, 1 /* 2-byte chunks */ | |
309 | #if XCHAL_HAVE_LOOPS | |
310 | loopgtz a10, 4f | |
311 | #else | |
312 | beqz a10, 4f | |
313 | slli a10, a10, 1 | |
314 | add a10, a10, a2 /* a10 = end of last 2-byte src chunk */ | |
315 | .Loop7: | |
316 | #endif | |
317 | SRC( l16ui a9, a2, 0 ) | |
318 | DST( s16i a9, a3, 0 ) | |
319 | ONES_ADD(a5, a9) | |
320 | addi a2, a2, 2 | |
321 | addi a3, a3, 2 | |
322 | #if !XCHAL_HAVE_LOOPS | |
323 | bne a2, a10, .Loop7 /* exact end test, see .Loop5 */ | |
324 | #endif | |
325 | 4: | |
326 | /* This section processes a possible trailing odd byte. */ | |
327 | _bbci.l a4, 0, 8f /* 1-byte chunk */ | |
328 | SRC( l8ui a9, a2, 0 ) | |
329 | DST( s8i a9, a3, 0 ) | |
330 | #ifdef __XTENSA_EB__ | |
331 | slli a9, a9, 8 /* shift byte to bits 8..15 */ | |
332 | #endif | |
333 | ONES_ADD(a5, a9) | |
334 | 8: | |
335 | mov a2, a5 /* return the accumulated sum */ | |
336 | retw | |
337 | |||
338 | 5: | |
339 | /* Control branches here when either src or dst is odd. We | |
340 | process all bytes using 8-bit accesses. Grossly inefficient, | |
341 | so don't feed us an odd address. */ | |
342 | |||
343 | srli a10, a4, 1 /* handle in pairs for 16-bit csum */ | |
344 | #if XCHAL_HAVE_LOOPS | |
345 | loopgtz a10, 6f | |
346 | #else | |
347 | beqz a10, 6f | |
348 | slli a10, a10, 1 | |
349 | add a10, a10, a2 /* a10 = end of last odd-aligned, 2-byte src chunk */ | |
350 | .Loop8: | |
351 | #endif | |
352 | SRC( l8ui a9, a2, 0 ) | |
353 | SRC( l8ui a8, a2, 1 ) | |
354 | DST( s8i a9, a3, 0 ) | |
355 | DST( s8i a8, a3, 1 ) | |
356 | #ifdef __XTENSA_EB__ | |
357 | slli a9, a9, 8 /* combine into a single 16-bit value */ | |
358 | #else /* for checksum computation */ | |
359 | slli a8, a8, 8 | |
360 | #endif | |
361 | or a9, a9, a8 | |
362 | ONES_ADD(a5, a9) | |
363 | addi a2, a2, 2 | |
364 | addi a3, a3, 2 | |
365 | #if !XCHAL_HAVE_LOOPS | |
366 | bne a2, a10, .Loop8 /* exact end test, see .Loop5 */ | |
367 | #endif | |
368 | 6: | |
369 | j 4b /* process the possible trailing odd byte */ | |
370 | |||
371 | |||
372 | # Exception handler: | |
373 | .section .fixup, "ax" | |
374 | /* | |
375 | a6 = src_err_ptr | |
376 | a7 = dst_err_ptr | |
377 | a11 = original len for exception handling | |
378 | a12 = original dst for exception handling | |
379 | */ | |
380 | |||
381 | 6001: | |
382 | _movi a2, -EFAULT | |
383 | s32i a2, a6, 0 /* src_err_ptr */ | |
384 | |||
385 | # clear the complete destination - computing the rest | |
386 | # is too much work | |
387 | movi a2, 0 /* zero fill byte; also the value returned in a2 */ | |
388 | #if XCHAL_HAVE_LOOPS | |
389 | loopgtz a11, 2f | |
390 | #else | |
391 | blti a11, 1, 2f /* skip when len <= 0, the same condition loopgtz uses */ | |
392 | add a11, a11, a12 /* a11 = ending address */ | |
393 | .Leloop: | |
394 | #endif | |
395 | s8i a2, a12, 0 | |
396 | addi a12, a12, 1 | |
397 | #if !XCHAL_HAVE_LOOPS | |
398 | bne a12, a11, .Leloop /* exact end test: a12 steps by 1 and lands on a11; a signed blt breaks if dst spans 0x80000000 */ | |
399 | #endif | |
400 | 2: | |
401 | retw | |
402 | |||
403 | 6002: | |
404 | movi a2, -EFAULT | |
405 | s32i a2, a7, 0 /* dst_err_ptr */ | |
406 | movi a2, 0 /* return 0 after a failed store */ | |
407 | retw | |
408 | |||
409 | .previous | |
410 | |||