author	Anton Blanchard <anton@samba.org>	2010-08-02 16:09:52 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2010-09-02 00:07:30 -0400
commit	fdd374b62ca4df144c0138359dcffa83df7a0ea8 (patch)
tree	8f52a7648adb30012b01589892b71913cdbc4cd7 /arch/powerpc/lib/checksum_64.S
parent	9b83ecb0a3cf1bf7ecf84359ddcfb9dd49646bf2 (diff)
powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user
We use the same core loop as the new csum_partial, adding in the
stores and exception handling code. To keep things simple we do all
the exception fixup in csum_and_copy_from_user. This wrapper function
is modelled on the generic checksum code and is careful to always
calculate a complete checksum even if we only copied part of the data
to userspace.

To test this I forced checksumming on over loopback and ran socklib
(a simple TCP benchmark). On a POWER6 575 throughput improved by 19%
with this patch. If I forced both the sender and receiver onto the
same cpu (with the hope of shifting the benchmark from being cache
bandwidth limited to cpu limited), adding this patch improved
performance by 55%.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib/checksum_64.S')
-rw-r--r--  arch/powerpc/lib/checksum_64.S  289
1 files changed, 202 insertions, 87 deletions
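
The csum_and_copy_from_user wrapper mentioned above lives outside checksum_64.S, so it does not appear in the diff below. As a rough C sketch of the behaviour the commit message describes (the function and helper names follow the kernel's checksum API, but the body is an illustrative approximation, not the committed wrapper): the assembly routine copies and checksums in one pass, and on a source fault the wrapper copies whatever is accessible, zero-fills the rest of the destination, and checksums the whole buffer so the returned sum always covers len bytes.

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <asm/checksum.h>
#include <asm/uaccess.h>

/* Illustrative sketch only: models the behaviour described in the
 * commit message, not the wrapper actually merged with this series. */
__wsum csum_and_copy_from_user(const void __user *src, void *dst,
			       int len, __wsum sum, int *err_ptr)
{
	__wsum csum;

	*err_ptr = 0;

	/* Fast path: copy and checksum in one pass; a faulting userspace
	 * read is reported through err_ptr by the exception fixup. */
	csum = csum_partial_copy_generic((void __force *)src, dst, len,
					 sum, err_ptr, NULL);

	if (unlikely(*err_ptr)) {
		/* Slow path: copy whatever is accessible, zero the rest,
		 * then checksum the whole destination so the returned sum
		 * still covers all len bytes. */
		int missing = __copy_from_user(dst, src, len);

		memset((char *)dst + len - missing, 0, missing);
		*err_ptr = -EFAULT;
		csum = csum_partial(dst, len, sum);
	}

	return csum;
}

The matching change in the diff is the comment at the top of csum_partial_copy_generic: the assembly no longer zeroes the destination tail on a source fault, so any fixup of this kind is now the caller's job.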
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 404d5a6e3387..18245af38aea 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -228,115 +228,230 @@ _GLOBAL(csum_partial)
 	srdi	r3,r3,32
 	blr
 
+
+	.macro source
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Lsrc_error
+	.previous
+	.endm
+
+	.macro dest
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldest_error
+	.previous
+	.endm
+
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky. For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating partial checksum etc).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-	addic	r0,r6,0
-	subi	r3,r3,4
-	subi	r4,r4,4
-	srwi.	r6,r5,2
-	beq	3f		/* if we're doing < 4 bytes */
-	andi.	r9,r4,2		/* Align dst to longword boundary */
-	beq+	1f
-81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
+	addic	r0,r6,0			/* clear carry */
+
+	srdi.	r6,r5,3			/* less than 8 bytes? */
+	beq	.Lcopy_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 *
+	 * If the source and destination are relatively unaligned we only
+	 * align the source. This keeps things simple.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcopy_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+source;	lhz	r6,0(r3)		/* align to doubleword */
 	subi	r5,r5,2
-91:	sth	r6,4(r4)
-	addi	r4,r4,2
-	addc	r0,r0,r6
-	srwi.	r6,r5,2		/* # words to do */
-	beq	3f
-1:	mtctr	r6
-82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
-92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
-	adde	r0,r0,r6
-	bdnz	82b
-	andi.	r5,r5,3
-3:	cmpwi	0,r5,2
-	blt+	4f
-83:	lhz	r6,4(r3)
 	addi	r3,r3,2
-	subi	r5,r5,2
-93:	sth	r6,4(r4)
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
+	bdnz	1b
+
+.Lcopy_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r5,7
+	beq	.Lcopy_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r5,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+	bdnz	2b
+
+
 	adde	r0,r0,r6
-4:	cmpwi	0,r5,1
-	bne+	5f
-84:	lbz	r6,4(r3)
-94:	stb	r6,4(r4)
-	slwi	r6,r6,8		/* Upper byte of word */
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r5,r5,63
+
+.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r5,3
+	beq	.Lcopy_tail_word
+
+	mtctr	r6
+3:
+source;	ld	r6,0(r3)
+	addi	r3,r3,8
 	adde	r0,r0,r6
-5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
-	rldicl	r4,r3,32,0	/* fold 64 bit value */
-	add	r3,r4,r3
-	srdi	r3,r3,32
-	blr
+dest;	std	r6,0(r4)
+	addi	r4,r4,8
+	bdnz	3b
 
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+	andi.	r5,r5,7
 
-	.globl src_error_1
-src_error_1:
-	li	r6,0
-	subi	r5,r5,2
-95:	sth	r6,4(r4)
+.Lcopy_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r5,2
+	beq	.Lcopy_tail_halfword
+
+source;	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+dest;	stw	r6,0(r4)
+	addi	r4,r4,4
+	subi	r5,r5,4
+
+.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r5,1
+	beq	.Lcopy_tail_byte
+
+source;	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
-	srwi.	r6,r5,2
-	beq	3f
-	mtctr	r6
-	.globl src_error_2
-src_error_2:
-	li	r6,0
-96:	stwu	r6,4(r4)
-	bdnz	96b
-3:	andi.	r5,r5,3
-	beq	src_error
-	.globl src_error_3
-src_error_3:
-	li	r6,0
-	mtctr	r5
-	addi	r4,r4,3
-97:	stbu	r6,1(r4)
-	bdnz	97b
-	.globl src_error
-src_error:
+	subi	r5,r5,2
+
+.Lcopy_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r5,1
+	beq	.Lcopy_finish
+
+source;	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+dest;	stb	r6,0(r4)
+
+.Lcopy_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+.Lsrc_error:
 	cmpdi	0,r7,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r7)
-1:	addze	r3,r0
 	blr
 
-	.globl dst_error
-dst_error:
+.Ldest_error:
 	cmpdi	0,r8,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r8)
-1:	addze	r3,r0
 	blr
-
-.section __ex_table,"a"
-	.align	3
-	.llong	81b,src_error_1
-	.llong	91b,dst_error
-	.llong	82b,src_error_2
-	.llong	92b,dst_error
-	.llong	83b,src_error_3
-	.llong	93b,dst_error
-	.llong	84b,src_error_3
-	.llong	94b,dst_error
-	.llong	95b,dst_error
-	.llong	96b,dst_error
-	.llong	97b,dst_error
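
For reference, the .Lcopy_finish sequence above (addze, rldicl, add, srdi) folds the 64-bit running sum down to the 32-bit partial checksum returned in r3. A small C model of that fold, assuming a 64-bit unsigned type (the name fold64 is only for illustration):

#include <stdint.h>

/* Model of .Lcopy_finish: addze has already folded the last carry into
 * the 64-bit sum; this folds the two 32-bit halves together. */
static inline uint32_t fold64(uint64_t sum)
{
	/* rldicl r4,r0,32,0 : rotate the 64-bit sum left by 32 bits */
	uint64_t rot = (sum << 32) | (sum >> 32);

	/* add r3,r4,r0 ; srdi r3,r3,32 : the upper half of the 64-bit
	 * addition is high + low plus the carry out of the lower half,
	 * i.e. the end-around-carry (ones' complement) fold. */
	return (uint32_t)((sum + rot) >> 32);
}

This is why a single srdi suffices: any carry produced when the two halves are added lands in the upper word of the 64-bit add, so no separate end-around-carry step is needed before the result is returned.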