diff options
Diffstat (limited to 'arch/ia64/lib/ip_fast_csum.S')
-rw-r--r-- | arch/ia64/lib/ip_fast_csum.S | 90 |
1 files changed, 90 insertions, 0 deletions
diff --git a/arch/ia64/lib/ip_fast_csum.S b/arch/ia64/lib/ip_fast_csum.S new file mode 100644 index 000000000000..19674ca2acfc --- /dev/null +++ b/arch/ia64/lib/ip_fast_csum.S | |||
@@ -0,0 +1,90 @@ | |||
1 | /* | ||
2 | * Optimized version of the ip_fast_csum() function | |
3 | * Used for calculating IP header checksum | ||
4 | * | ||
5 | * Return: 16bit checksum, complemented | ||
6 | * | ||
7 | * Inputs: | ||
8 | * in0: address of buffer to checksum (char *) | ||
9 | * in1: length of the buffer in 32-bit words (int); 5 for a 20-byte header | |
10 | * | ||
11 | * Copyright (C) 2002 Intel Corp. | ||
12 | * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> | ||
13 | */ | ||
14 | |||
15 | #include <asm/asmmacro.h> | ||
16 | |||
17 | /* | ||
18 | * Since we know that most likely this function is called with buf aligned | ||
19 | * on 4-byte boundary and 20 bytes in length, we can execute rather quickly | |
20 | * versus calling generic version of do_csum, which has lots of overhead in | ||
21 | * handling various alignments and sizes. However, due to lack of constraints | |
22 | * put on the function input argument, cases with alignment not on 4-byte or | ||
23 | * size not equal to 20 bytes will be handled by the generic do_csum function. | ||
24 | */ | ||
25 | |||
/*
 * Register aliases used by ip_fast_csum below:
 *   in0  - first argument  (r32): address of the buffer
 *   in1  - second argument (r33): buffer length in 32-bit words
 *   ret0 - return value    (r8)
 * NOTE(review): presumably spelled out by hand because the fast path uses
 * in0/in1 before any alloc has been executed -- confirm against the
 * assembler's rules for the built-in in/out aliases.
 */
#define in0	r32
#define in1	r33
#define ret0	r8

/*
 * ip_fast_csum(in0 = buf, in1 = length in 32-bit words)
 *
 * Returns in ret0 the complemented 16-bit one's-complement sum of the
 * buffer, with bits 16..63 cleared.
 *
 * Fast path (in1 == 5 and buf 4-byte aligned): the five words are loaded
 *	with ld4, added, and folded to 16 bits inline.
 * Slow path (.generic, anything else): do_csum(buf, in1 * 4) is called and
 *	its result is complemented.
 *
 * Registers written:
 *	fast path: r9, r14, r15, r20-r24, p6, p7, in0 (post-incremented)
 *	slow path: additionally a new frame (r34 = saved b0, r35 = saved
 *		   ar.pfs, two outputs) plus whatever do_csum clobbers
 */
GLOBAL_ENTRY(ip_fast_csum)
	.prologue
	.body
	cmp.ne	p6,p7=5,in1		// size other than 20 byte?
	and	r14=3,in0		// is it aligned on 4-byte?
	add	r15=4,in0		// second source pointer
	;;
	// Merge the alignment test into the size test:
	//   p6 = (in1 != 5) || misaligned  -> take the generic path
	//   p7 = (in1 == 5) && aligned     -> stay on the fast path
	cmp.ne.or.andcm p6,p7=r14,r0
	;;
	// The first two loads are predicated so that they are only issued
	// when the fast path is actually taken.
(p7)	ld4	r20=[in0],8		// word 0, in0 -> word 2
(p7)	ld4	r21=[r15],8		// word 1, r15 -> word 3
(p6)	br.spnt	.generic
	;;
	ld4	r22=[in0],8		// word 2, in0 -> word 4
	ld4	r23=[r15],8		// word 3
	;;
	ld4	r24=[in0]		// word 4
	add	r20=r20,r21
	add	r22=r22,r23
	;;
	add	r20=r20,r22
	;;
	add	r20=r20,r24		// r20 = sum of five zero-extended 32-bit words
	;;
	// The sum needs up to 35 bits.  Each fold adds the bits above bit 15
	// back into the low 16 bits; three folds are enough to leave a value
	// that fits in 16 bits.
	shr.u	ret0=r20,16		// now need to add the carry
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16		// add carry again
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	;;
	shr.u	ret0=r20,16
	zxt2	r20=r20
	;;
	add	r20=ret0,r20
	mov	r9=0xffff		// mask for the 16-bit result
	;;
	// ret0 = 0xffff & ~r20: complement only the low 16 bits so that the
	// upper 48 bits of the return register stay zero.
	andcm	ret0=r9,r20
	.restore sp			// reset frame state
	br.ret.sptk.many b0
	;;

.generic:
	.prologue
	.save ar.pfs, r35
	alloc	r35=ar.pfs,2,2,2,0	// 2 inputs, 2 locals (r34, r35), 2 outputs
	.save rp, r34
	mov	r34=b0			// keep the return address across the call
	.body
	dep.z	out1=in1,2,30		// out1 = in1 << 2: word count -> byte count
	mov	out0=in0
	;;
	br.call.sptk.many b0=do_csum
	;;
	mov	r9=0xffff		// loaded after the call; r9 is not preserved by it
	mov	ar.pfs=r35
	mov	b0=r34
	;;
	andcm	ret0=r9,ret0		// 16-bit complement, same as the fast path
	br.ret.sptk.many b0
END(ip_fast_csum)