diff options
Diffstat (limited to 'arch/mips/lib/csum_partial.S')
-rw-r--r-- | arch/mips/lib/csum_partial.S | 293 |
1 files changed, 154 insertions, 139 deletions
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index 15611d9df7ac..9db357294be1 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -12,43 +12,66 @@ | |||
12 | #include <asm/regdef.h> | 12 | #include <asm/regdef.h> |
13 | 13 | ||
14 | #ifdef CONFIG_64BIT | 14 | #ifdef CONFIG_64BIT |
15 | #define T0 ta0 | 15 | /* |
16 | #define T1 ta1 | 16 | * As we are sharing the code base with the mips32 tree (which uses the o32 ABI |
17 | #define T2 ta2 | 17 | * register definitions), we need to redefine the register definitions from |
18 | #define T3 ta3 | 18 | * the n64 ABI register naming to the o32 ABI register naming. |
19 | #define T4 t0 | 19 | */ |
20 | #define T7 t3 | 20 | #undef t0 |
21 | #else | 21 | #undef t1 |
22 | #define T0 t0 | 22 | #undef t2 |
23 | #define T1 t1 | 23 | #undef t3 |
24 | #define T2 t2 | 24 | #define t0 $8 |
25 | #define T3 t3 | 25 | #define t1 $9 |
26 | #define T4 t4 | 26 | #define t2 $10 |
27 | #define T7 t7 | 27 | #define t3 $11 |
28 | #define t4 $12 | ||
29 | #define t5 $13 | ||
30 | #define t6 $14 | ||
31 | #define t7 $15 | ||
32 | |||
33 | #define USE_DOUBLE | ||
28 | #endif | 34 | #endif |
29 | 35 | ||
36 | #ifdef USE_DOUBLE | ||
37 | |||
38 | #define LOAD ld | ||
39 | #define ADD daddu | ||
40 | #define NBYTES 8 | ||
41 | |||
42 | #else | ||
43 | |||
44 | #define LOAD lw | ||
45 | #define ADD addu | ||
46 | #define NBYTES 4 | ||
47 | |||
48 | #endif /* USE_DOUBLE */ | ||
49 | |||
50 | #define UNIT(unit) ((unit)*NBYTES) | ||
51 | |||
30 | #define ADDC(sum,reg) \ | 52 | #define ADDC(sum,reg) \ |
31 | addu sum, reg; \ | 53 | ADD sum, reg; \ |
32 | sltu v1, sum, reg; \ | 54 | sltu v1, sum, reg; \ |
33 | addu sum, v1 | 55 | ADD sum, v1 |
34 | 56 | ||
35 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ | 57 | #define CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) \ |
36 | lw _t0, (offset + 0x00)(src); \ | 58 | LOAD _t0, (offset + UNIT(0))(src); \ |
37 | lw _t1, (offset + 0x04)(src); \ | 59 | LOAD _t1, (offset + UNIT(1))(src); \ |
38 | lw _t2, (offset + 0x08)(src); \ | 60 | LOAD _t2, (offset + UNIT(2))(src); \ |
39 | lw _t3, (offset + 0x0c)(src); \ | 61 | LOAD _t3, (offset + UNIT(3))(src); \ |
40 | ADDC(sum, _t0); \ | ||
41 | ADDC(sum, _t1); \ | ||
42 | ADDC(sum, _t2); \ | ||
43 | ADDC(sum, _t3); \ | ||
44 | lw _t0, (offset + 0x10)(src); \ | ||
45 | lw _t1, (offset + 0x14)(src); \ | ||
46 | lw _t2, (offset + 0x18)(src); \ | ||
47 | lw _t3, (offset + 0x1c)(src); \ | ||
48 | ADDC(sum, _t0); \ | 62 | ADDC(sum, _t0); \ |
49 | ADDC(sum, _t1); \ | 63 | ADDC(sum, _t1); \ |
50 | ADDC(sum, _t2); \ | 64 | ADDC(sum, _t2); \ |
51 | ADDC(sum, _t3); \ | 65 | ADDC(sum, _t3) |
66 | |||
67 | #ifdef USE_DOUBLE | ||
68 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ | ||
69 | CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3) | ||
70 | #else | ||
71 | #define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \ | ||
72 | CSUM_BIGCHUNK1(src, offset, sum, _t0, _t1, _t2, _t3); \ | ||
73 | CSUM_BIGCHUNK1(src, offset + 0x10, sum, _t0, _t1, _t2, _t3) | ||
74 | #endif | ||
52 | 75 | ||
53 | /* | 76 | /* |
54 | * a0: source address | 77 | * a0: source address |
@@ -61,86 +84,27 @@ | |||
61 | 84 | ||
62 | .text | 85 | .text |
63 | .set noreorder | 86 | .set noreorder |
64 | |||
65 | /* unknown src alignment and < 8 bytes to go */ | ||
66 | small_csumcpy: | ||
67 | move a1, T2 | ||
68 | |||
69 | andi T0, a1, 4 | ||
70 | beqz T0, 1f | ||
71 | andi T0, a1, 2 | ||
72 | |||
73 | /* Still a full word to go */ | ||
74 | ulw T1, (src) | ||
75 | PTR_ADDIU src, 4 | ||
76 | ADDC(sum, T1) | ||
77 | |||
78 | 1: move T1, zero | ||
79 | beqz T0, 1f | ||
80 | andi T0, a1, 1 | ||
81 | |||
82 | /* Still a halfword to go */ | ||
83 | ulhu T1, (src) | ||
84 | PTR_ADDIU src, 2 | ||
85 | |||
86 | 1: beqz T0, 1f | ||
87 | sll T1, T1, 16 | ||
88 | |||
89 | lbu T2, (src) | ||
90 | nop | ||
91 | |||
92 | #ifdef __MIPSEB__ | ||
93 | sll T2, T2, 8 | ||
94 | #endif | ||
95 | or T1, T2 | ||
96 | |||
97 | 1: ADDC(sum, T1) | ||
98 | |||
99 | /* fold checksum */ | ||
100 | sll v1, sum, 16 | ||
101 | addu sum, v1 | ||
102 | sltu v1, sum, v1 | ||
103 | srl sum, sum, 16 | ||
104 | addu sum, v1 | ||
105 | |||
106 | /* odd buffer alignment? */ | ||
107 | beqz T7, 1f | ||
108 | nop | ||
109 | sll v1, sum, 8 | ||
110 | srl sum, sum, 8 | ||
111 | or sum, v1 | ||
112 | andi sum, 0xffff | ||
113 | 1: | ||
114 | .set reorder | ||
115 | /* Add the passed partial csum. */ | ||
116 | ADDC(sum, a2) | ||
117 | jr ra | ||
118 | .set noreorder | ||
119 | |||
120 | /* ------------------------------------------------------------------------- */ | ||
121 | |||
122 | .align 5 | 87 | .align 5 |
123 | LEAF(csum_partial) | 88 | LEAF(csum_partial) |
124 | move sum, zero | 89 | move sum, zero |
125 | move T7, zero | 90 | move t7, zero |
126 | 91 | ||
127 | sltiu t8, a1, 0x8 | 92 | sltiu t8, a1, 0x8 |
128 | bnez t8, small_csumcpy /* < 8 bytes to copy */ | 93 | bnez t8, small_csumcpy /* < 8 bytes to copy */ |
129 | move T2, a1 | 94 | move t2, a1 |
130 | 95 | ||
131 | beqz a1, out | 96 | andi t7, src, 0x1 /* odd buffer? */ |
132 | andi T7, src, 0x1 /* odd buffer? */ | ||
133 | 97 | ||
134 | hword_align: | 98 | hword_align: |
135 | beqz T7, word_align | 99 | beqz t7, word_align |
136 | andi t8, src, 0x2 | 100 | andi t8, src, 0x2 |
137 | 101 | ||
138 | lbu T0, (src) | 102 | lbu t0, (src) |
139 | LONG_SUBU a1, a1, 0x1 | 103 | LONG_SUBU a1, a1, 0x1 |
140 | #ifdef __MIPSEL__ | 104 | #ifdef __MIPSEL__ |
141 | sll T0, T0, 8 | 105 | sll t0, t0, 8 |
142 | #endif | 106 | #endif |
143 | ADDC(sum, T0) | 107 | ADDC(sum, t0) |
144 | PTR_ADDU src, src, 0x1 | 108 | PTR_ADDU src, src, 0x1 |
145 | andi t8, src, 0x2 | 109 | andi t8, src, 0x2 |
146 | 110 | ||
@@ -148,9 +112,9 @@ word_align: | |||
148 | beqz t8, dword_align | 112 | beqz t8, dword_align |
149 | sltiu t8, a1, 56 | 113 | sltiu t8, a1, 56 |
150 | 114 | ||
151 | lhu T0, (src) | 115 | lhu t0, (src) |
152 | LONG_SUBU a1, a1, 0x2 | 116 | LONG_SUBU a1, a1, 0x2 |
153 | ADDC(sum, T0) | 117 | ADDC(sum, t0) |
154 | sltiu t8, a1, 56 | 118 | sltiu t8, a1, 56 |
155 | PTR_ADDU src, src, 0x2 | 119 | PTR_ADDU src, src, 0x2 |
156 | 120 | ||
@@ -162,9 +126,9 @@ dword_align: | |||
162 | beqz t8, qword_align | 126 | beqz t8, qword_align |
163 | andi t8, src, 0x8 | 127 | andi t8, src, 0x8 |
164 | 128 | ||
165 | lw T0, 0x00(src) | 129 | lw t0, 0x00(src) |
166 | LONG_SUBU a1, a1, 0x4 | 130 | LONG_SUBU a1, a1, 0x4 |
167 | ADDC(sum, T0) | 131 | ADDC(sum, t0) |
168 | PTR_ADDU src, src, 0x4 | 132 | PTR_ADDU src, src, 0x4 |
169 | andi t8, src, 0x8 | 133 | andi t8, src, 0x8 |
170 | 134 | ||
@@ -172,11 +136,17 @@ qword_align: | |||
172 | beqz t8, oword_align | 136 | beqz t8, oword_align |
173 | andi t8, src, 0x10 | 137 | andi t8, src, 0x10 |
174 | 138 | ||
175 | lw T0, 0x00(src) | 139 | #ifdef USE_DOUBLE |
176 | lw T1, 0x04(src) | 140 | ld t0, 0x00(src) |
141 | LONG_SUBU a1, a1, 0x8 | ||
142 | ADDC(sum, t0) | ||
143 | #else | ||
144 | lw t0, 0x00(src) | ||
145 | lw t1, 0x04(src) | ||
177 | LONG_SUBU a1, a1, 0x8 | 146 | LONG_SUBU a1, a1, 0x8 |
178 | ADDC(sum, T0) | 147 | ADDC(sum, t0) |
179 | ADDC(sum, T1) | 148 | ADDC(sum, t1) |
149 | #endif | ||
180 | PTR_ADDU src, src, 0x8 | 150 | PTR_ADDU src, src, 0x8 |
181 | andi t8, src, 0x10 | 151 | andi t8, src, 0x10 |
182 | 152 | ||
@@ -184,75 +154,120 @@ oword_align: | |||
184 | beqz t8, begin_movement | 154 | beqz t8, begin_movement |
185 | LONG_SRL t8, a1, 0x7 | 155 | LONG_SRL t8, a1, 0x7 |
186 | 156 | ||
187 | lw T3, 0x08(src) | 157 | #ifdef USE_DOUBLE |
188 | lw T4, 0x0c(src) | 158 | ld t0, 0x00(src) |
189 | lw T0, 0x00(src) | 159 | ld t1, 0x08(src) |
190 | lw T1, 0x04(src) | 160 | ADDC(sum, t0) |
191 | ADDC(sum, T3) | 161 | ADDC(sum, t1) |
192 | ADDC(sum, T4) | 162 | #else |
193 | ADDC(sum, T0) | 163 | CSUM_BIGCHUNK1(src, 0x00, sum, t0, t1, t3, t4) |
194 | ADDC(sum, T1) | 164 | #endif |
195 | LONG_SUBU a1, a1, 0x10 | 165 | LONG_SUBU a1, a1, 0x10 |
196 | PTR_ADDU src, src, 0x10 | 166 | PTR_ADDU src, src, 0x10 |
197 | LONG_SRL t8, a1, 0x7 | 167 | LONG_SRL t8, a1, 0x7 |
198 | 168 | ||
199 | begin_movement: | 169 | begin_movement: |
200 | beqz t8, 1f | 170 | beqz t8, 1f |
201 | andi T2, a1, 0x40 | 171 | andi t2, a1, 0x40 |
202 | 172 | ||
203 | move_128bytes: | 173 | move_128bytes: |
204 | CSUM_BIGCHUNK(src, 0x00, sum, T0, T1, T3, T4) | 174 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
205 | CSUM_BIGCHUNK(src, 0x20, sum, T0, T1, T3, T4) | 175 | CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) |
206 | CSUM_BIGCHUNK(src, 0x40, sum, T0, T1, T3, T4) | 176 | CSUM_BIGCHUNK(src, 0x40, sum, t0, t1, t3, t4) |
207 | CSUM_BIGCHUNK(src, 0x60, sum, T0, T1, T3, T4) | 177 | CSUM_BIGCHUNK(src, 0x60, sum, t0, t1, t3, t4) |
208 | LONG_SUBU t8, t8, 0x01 | 178 | LONG_SUBU t8, t8, 0x01 |
209 | bnez t8, move_128bytes | 179 | bnez t8, move_128bytes |
210 | PTR_ADDU src, src, 0x80 | 180 | PTR_ADDU src, src, 0x80 |
211 | 181 | ||
212 | 1: | 182 | 1: |
213 | beqz T2, 1f | 183 | beqz t2, 1f |
214 | andi T2, a1, 0x20 | 184 | andi t2, a1, 0x20 |
215 | 185 | ||
216 | move_64bytes: | 186 | move_64bytes: |
217 | CSUM_BIGCHUNK(src, 0x00, sum, T0, T1, T3, T4) | 187 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
218 | CSUM_BIGCHUNK(src, 0x20, sum, T0, T1, T3, T4) | 188 | CSUM_BIGCHUNK(src, 0x20, sum, t0, t1, t3, t4) |
219 | PTR_ADDU src, src, 0x40 | 189 | PTR_ADDU src, src, 0x40 |
220 | 190 | ||
221 | 1: | 191 | 1: |
222 | beqz T2, do_end_words | 192 | beqz t2, do_end_words |
223 | andi t8, a1, 0x1c | 193 | andi t8, a1, 0x1c |
224 | 194 | ||
225 | move_32bytes: | 195 | move_32bytes: |
226 | CSUM_BIGCHUNK(src, 0x00, sum, T0, T1, T3, T4) | 196 | CSUM_BIGCHUNK(src, 0x00, sum, t0, t1, t3, t4) |
227 | andi t8, a1, 0x1c | 197 | andi t8, a1, 0x1c |
228 | PTR_ADDU src, src, 0x20 | 198 | PTR_ADDU src, src, 0x20 |
229 | 199 | ||
230 | do_end_words: | 200 | do_end_words: |
231 | beqz t8, maybe_end_cruft | 201 | beqz t8, small_csumcpy |
232 | LONG_SRL t8, t8, 0x2 | 202 | andi t2, a1, 0x3 |
203 | LONG_SRL t8, t8, 0x2 | ||
233 | 204 | ||
234 | end_words: | 205 | end_words: |
235 | lw T0, (src) | 206 | lw t0, (src) |
236 | LONG_SUBU t8, t8, 0x1 | 207 | LONG_SUBU t8, t8, 0x1 |
237 | ADDC(sum, T0) | 208 | ADDC(sum, t0) |
238 | bnez t8, end_words | 209 | bnez t8, end_words |
239 | PTR_ADDU src, src, 0x4 | 210 | PTR_ADDU src, src, 0x4 |
240 | 211 | ||
241 | maybe_end_cruft: | 212 | /* unknown src alignment and < 8 bytes to go */ |
242 | andi T2, a1, 0x3 | 213 | small_csumcpy: |
214 | move a1, t2 | ||
243 | 215 | ||
244 | small_memcpy: | 216 | andi t0, a1, 4 |
245 | j small_csumcpy; move a1, T2 /* XXX ??? */ | 217 | beqz t0, 1f |
246 | beqz t2, out | 218 | andi t0, a1, 2 |
247 | move a1, T2 | ||
248 | 219 | ||
249 | end_bytes: | 220 | /* Still a full word to go */ |
250 | lb T0, (src) | 221 | ulw t1, (src) |
251 | LONG_SUBU a1, a1, 0x1 | 222 | PTR_ADDIU src, 4 |
252 | bnez a2, end_bytes | 223 | ADDC(sum, t1) |
253 | PTR_ADDU src, src, 0x1 | 224 | |
225 | 1: move t1, zero | ||
226 | beqz t0, 1f | ||
227 | andi t0, a1, 1 | ||
228 | |||
229 | /* Still a halfword to go */ | ||
230 | ulhu t1, (src) | ||
231 | PTR_ADDIU src, 2 | ||
232 | |||
233 | 1: beqz t0, 1f | ||
234 | sll t1, t1, 16 | ||
235 | |||
236 | lbu t2, (src) | ||
237 | nop | ||
254 | 238 | ||
255 | out: | 239 | #ifdef __MIPSEB__ |
240 | sll t2, t2, 8 | ||
241 | #endif | ||
242 | or t1, t2 | ||
243 | |||
244 | 1: ADDC(sum, t1) | ||
245 | |||
246 | /* fold checksum */ | ||
247 | #ifdef USE_DOUBLE | ||
248 | dsll32 v1, sum, 0 | ||
249 | daddu sum, v1 | ||
250 | sltu v1, sum, v1 | ||
251 | dsra32 sum, sum, 0 | ||
252 | addu sum, v1 | ||
253 | #endif | ||
254 | sll v1, sum, 16 | ||
255 | addu sum, v1 | ||
256 | sltu v1, sum, v1 | ||
257 | srl sum, sum, 16 | ||
258 | addu sum, v1 | ||
259 | |||
260 | /* odd buffer alignment? */ | ||
261 | beqz t7, 1f | ||
262 | nop | ||
263 | sll v1, sum, 8 | ||
264 | srl sum, sum, 8 | ||
265 | or sum, v1 | ||
266 | andi sum, 0xffff | ||
267 | 1: | ||
268 | .set reorder | ||
269 | /* Add the passed partial csum. */ | ||
270 | ADDC(sum, a2) | ||
256 | jr ra | 271 | jr ra |
257 | move v0, sum | 272 | .set noreorder |
258 | END(csum_partial) | 273 | END(csum_partial) |