path: root/arch/powerpc/lib/checksum_64.S
author		Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit		c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree		ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/powerpc/lib/checksum_64.S
parent		ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent		6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'arch/powerpc/lib/checksum_64.S')
-rw-r--r--	arch/powerpc/lib/checksum_64.S | 482
1 file changed, 355 insertions(+), 127 deletions(-)
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index ef96c6c58efc..18245af38aea 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -65,165 +65,393 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
+#define STACKFRAMESIZE 256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
  * csum_partial(r3=buff, r4=len, r5=sum)
  */
 _GLOBAL(csum_partial)
-	subi	r3,r3,8		/* we'll offset by 8 for the loads */
-	srdi.	r6,r4,3		/* divide by 8 for doubleword count */
-	addic	r5,r5,0		/* clear carry */
-	beq	3f		/* if we're doing < 8 bytes */
-	andi.	r0,r3,2		/* aligned on a word boundary already? */
-	beq+	1f
-	lhz	r6,8(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
-	subi	r4,r4,2
-	addc	r5,r5,r6
-	srdi.	r6,r4,3		/* recompute number of doublewords */
-	beq	3f		/* any left? */
-1:	mtctr	r6
-2:	ldu	r6,8(r3)	/* main sum loop */
-	adde	r5,r5,r6
-	bdnz	2b
-	andi.	r4,r4,7		/* compute bytes left to sum after doublewords */
-3:	cmpwi	0,r4,4		/* is at least a full word left? */
-	blt	4f
-	lwz	r6,8(r3)	/* sum this word */
+	addic	r0,r5,0			/* clear carry */
+
+	srdi.	r6,r4,3			/* less than 8 bytes? */
+	beq	.Lcsum_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcsum_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+	lhz	r6,0(r3)		/* align to doubleword */
+	subi	r4,r4,2
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	bdnz	1b
+
+.Lcsum_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r4,7
+	beq	.Lcsum_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r4,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+
+	adde	r0,r0,r11
+
+	adde	r0,r0,r12
+
+	adde	r0,r0,r14
+
+	adde	r0,r0,r15
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+	adde	r0,r0,r11
+	adde	r0,r0,r12
+	adde	r0,r0,r14
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r4,r4,63
+
+.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r4,3
+	beq	.Lcsum_tail_word
+
+	mtctr	r6
+3:
+	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+	bdnz	3b
+
+	andi.	r4,r4,7
+
+.Lcsum_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r4,2
+	beq	.Lcsum_tail_halfword
+
+	lwz	r6,0(r3)
 	addi	r3,r3,4
+	adde	r0,r0,r6
 	subi	r4,r4,4
-	adde	r5,r5,r6
-4:	cmpwi	0,r4,2		/* is at least a halfword left? */
-	blt+	5f
-	lhz	r6,8(r3)	/* sum this halfword */
-	addi	r3,r3,2
-	subi	r4,r4,2
-	adde	r5,r5,r6
-5:	cmpwi	0,r4,1		/* is at least a byte left? */
-	bne+	6f
-	lbz	r6,8(r3)	/* sum this byte */
-	slwi	r6,r6,8		/* this byte is assumed to be the upper byte of a halfword */
-	adde	r5,r5,r6
-6:	addze	r5,r5		/* add in final carry */
-	rldicl	r4,r5,32,0	/* fold two 32-bit halves together */
-	add	r3,r4,r5
-	srdi	r3,r3,32
-	blr
+
+.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r4,1
+	beq	.Lcsum_tail_byte
+
+	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	subi	r4,r4,2
+
+.Lcsum_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r4,1
+	beq	.Lcsum_finish
+
+	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+
+.Lcsum_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
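
[Editor's note] The new csum_partial accumulates 64 bits at a time with adde, an add that also folds the carry bit back in via XER, which is why the unrolled body above spreads loads between the eight dependent adde instructions: at 2 cycles per back-to-back adde, eight of them give the 16-cycle-per-iteration floor the in-code comment cites for each 64-byte pass. The closing rldicl/add/srdi sequence folds the 64-bit accumulator down to 32 bits. A minimal C model of that arithmetic (illustrative only, not kernel code; helper names are invented and the sub-doubleword tails are elided):

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	/* Models adde: a 64-bit add whose carry out is added back in. */
	static uint64_t add_with_carry(uint64_t sum, uint64_t v)
	{
		uint64_t t = sum + v;
		return t + (t < sum);		/* end-around carry */
	}

	/* Models the rldicl/add/srdi epilogue: add the two 32-bit halves,
	 * then fold the carry of that sum back in. */
	static uint32_t fold64(uint64_t acc)
	{
		uint64_t s = (acc >> 32) + (acc & 0xffffffffULL);
		return (uint32_t)(s + (s >> 32));
	}

	uint32_t csum_partial_model(const void *buff, size_t len, uint32_t sum)
	{
		const uint8_t *p = buff;
		uint64_t acc = sum;

		while (len >= 8) {		/* doubleword loop */
			uint64_t v;
			memcpy(&v, p, 8);	/* alignment-safe load */
			acc = add_with_carry(acc, v);
			p += 8;
			len -= 8;
		}
		/* word/halfword/byte tails elided; see the assembly above */
		return fold64(acc);
	}

The diff continues below with the exception-table helper macros.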
+
+
+	.macro source
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Lsrc_error
+	.previous
+	.endm
+
+	.macro dest
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldest_error
+	.previous
+	.endm
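
[Editor's note] The source and dest macros tag every load from src and store to dst with an entry in the __ex_table section: a (faulting instruction address, fixup address) pair that the page-fault handler consults, so a fault on a bad pointer resumes at .Lsrc_error or .Ldest_error instead of oopsing. A simplified C view of that lookup (names are illustrative; the real kernel uses struct exception_table_entry and searches a sorted table):

	struct ex_entry {
		unsigned long insn;	/* address of the load/store (the 100b/200b label) */
		unsigned long fixup;	/* where to resume (.Lsrc_error/.Ldest_error) */
	};

	/* Conceptually, on a fault the trap handler does: */
	static unsigned long ex_fixup(const struct ex_entry *tbl, int n,
				      unsigned long faulting_insn)
	{
		for (int i = 0; i < n; i++)
			if (tbl[i].insn == faulting_insn)
				return tbl[i].fixup;	/* redirect the NIP here */
		return 0;	/* no entry: a genuine kernel fault */
	}

The diff continues below with csum_partial_copy_generic.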
 
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating partial checksum etc).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-	addic	r0,r6,0
-	subi	r3,r3,4
-	subi	r4,r4,4
-	srwi.	r6,r5,2
-	beq	3f		/* if we're doing < 4 bytes */
-	andi.	r9,r4,2		/* Align dst to longword boundary */
-	beq+	1f
-81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
+	addic	r0,r6,0			/* clear carry */
+
+	srdi.	r6,r5,3			/* less than 8 bytes? */
+	beq	.Lcopy_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 *
+	 * If the source and destination are relatively unaligned we only
+	 * align the source. This keeps things simple.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcopy_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+source;	lhz	r6,0(r3)		/* align to doubleword */
 	subi	r5,r5,2
-91:	sth	r6,4(r4)
-	addi	r4,r4,2
-	addc	r0,r0,r6
-	srwi.	r6,r5,2		/* # words to do */
-	beq	3f
-1:	mtctr	r6
-82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
-92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
-	adde	r0,r0,r6
-	bdnz	82b
-	andi.	r5,r5,3
-3:	cmpwi	0,r5,2
-	blt+	4f
-83:	lhz	r6,4(r3)
 	addi	r3,r3,2
-	subi	r5,r5,2
-93:	sth	r6,4(r4)
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
+	bdnz	1b
+
+.Lcopy_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r5,7
+	beq	.Lcopy_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r5,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
 	adde	r0,r0,r6
-4:	cmpwi	0,r5,1
-	bne+	5f
-84:	lbz	r6,4(r3)
-94:	stb	r6,4(r4)
-	slwi	r6,r6,8		/* Upper byte of word */
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+	bdnz	2b
+
+
 	adde	r0,r0,r6
-5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
-	rldicl	r4,r3,32,0	/* fold 64 bit value */
-	add	r3,r4,r3
-	srdi	r3,r3,32
-	blr
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
 
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r5,r5,63
+
+.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r5,3
+	beq	.Lcopy_tail_word
 
-	.globl src_error_1
-src_error_1:
-	li	r6,0
-	subi	r5,r5,2
-95:	sth	r6,4(r4)
-	addi	r4,r4,2
-	srwi.	r6,r5,2
-	beq	3f
 	mtctr	r6
-	.globl src_error_2
-src_error_2:
-	li	r6,0
-96:	stwu	r6,4(r4)
-	bdnz	96b
-3:	andi.	r5,r5,3
-	beq	src_error
-	.globl src_error_3
-src_error_3:
-	li	r6,0
-	mtctr	r5
-	addi	r4,r4,3
-97:	stbu	r6,1(r4)
-	bdnz	97b
-	.globl src_error
-src_error:
+3:
+source;	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+dest;	std	r6,0(r4)
+	addi	r4,r4,8
+	bdnz	3b
+
+	andi.	r5,r5,7
+
+.Lcopy_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r5,2
+	beq	.Lcopy_tail_halfword
+
+source;	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+dest;	stw	r6,0(r4)
+	addi	r4,r4,4
+	subi	r5,r5,4
+
+.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r5,1
+	beq	.Lcopy_tail_byte
+
+source;	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
+	addi	r4,r4,2
+	subi	r5,r5,2
+
+.Lcopy_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r5,1
+	beq	.Lcopy_finish
+
+source;	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+dest;	stb	r6,0(r4)
+
+.Lcopy_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
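
[Editor's note] Each step of the copy variant pairs the checksum accumulate with a store, so the data is touched only once. A hedged C model of the doubleword tail loop (.Lcopy_tail_doublewords), with fault handling omitted and the same illustrative add_with_carry helper as in the earlier sketch:

	#include <stdint.h>
	#include <stddef.h>
	#include <string.h>

	static uint64_t add_with_carry(uint64_t sum, uint64_t v)
	{
		uint64_t t = sum + v;
		return t + (t < sum);		/* models adde's end-around carry */
	}

	static uint64_t copy_and_sum_doublewords(const uint8_t *src, uint8_t *dst,
						 size_t ndw, uint64_t acc)
	{
		for (size_t i = 0; i < ndw; i++) {
			uint64_t v;
			memcpy(&v, src + 8 * i, 8);	/* "source; ld  r6,0(r3)" */
			acc = add_with_carry(acc, v);	/* "adde    r0,r0,r6"     */
			memcpy(dst + 8 * i, &v, 8);	/* "dest;   std r6,0(r4)" */
		}
		return acc;
	}

The diff concludes below with the rewritten error paths.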
+
+.Lsrc_error:
 	cmpdi	0,r7,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r7)
-1:	addze	r3,r0
 	blr
 
-	.globl dst_error
-dst_error:
+.Ldest_error:
 	cmpdi	0,r8,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r8)
-1:	addze	r3,r0
 	blr
-
-.section __ex_table,"a"
-	.align	3
-	.llong	81b,src_error_1
-	.llong	91b,dst_error
-	.llong	82b,src_error_2
-	.llong	92b,dst_error
-	.llong	83b,src_error_3
-	.llong	93b,dst_error
-	.llong	84b,src_error_3
-	.llong	94b,dst_error
-	.llong	95b,dst_error
-	.llong	96b,dst_error
-	.llong	97b,dst_error
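
[Editor's note] The error paths encode a behavioural change: on a fault, .Lsrc_error/.Ldest_error now only store -EFAULT through the corresponding error pointer and return, the old zero-filling of the remainder of dst is gone, and the removed trailing __ex_table list is replaced by the per-access entries the source/dest macros emit. Per the updated header comment, cleanup is the caller's job. A hypothetical caller (prototype inferred from the register comment above; not kernel code):

	#include <errno.h>
	#include <string.h>

	extern unsigned int csum_partial_copy_generic(const void *src, void *dst,
						      int len, unsigned int sum,
						      int *src_err, int *dst_err);

	static int checksum_and_copy(const void *src, void *dst, int len,
				     unsigned int *csum)
	{
		int src_err = 0, dst_err = 0;

		*csum = csum_partial_copy_generic(src, dst, len, 0,
						  &src_err, &dst_err);
		if (src_err || dst_err) {
			/* The assembly no longer zeroes dst for us; the
			 * caller must clean up or retry as appropriate. */
			memset(dst, 0, len);
			return -EFAULT;
		}
		return 0;
	}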