author     Anton Blanchard <anton@samba.org>                 2010-08-02 16:09:52 -0400
committer  Benjamin Herrenschmidt <benh@kernel.crashing.org>  2010-09-02 00:07:30 -0400
commit     fdd374b62ca4df144c0138359dcffa83df7a0ea8
tree       8f52a7648adb30012b01589892b71913cdbc4cd7 /arch/powerpc/lib/checksum_64.S
parent     9b83ecb0a3cf1bf7ecf84359ddcfb9dd49646bf2
powerpc: Optimise 64bit csum_partial_copy_generic and add csum_and_copy_from_user
We use the same core loop as the new csum_partial, adding in the
stores and exception handling code. To keep things simple we do all the
exception fixup in csum_and_copy_from_user. This wrapper function is
modelled on the generic checksum code and is careful to always calculate
a complete checksum even if we only copied part of the data from userspace.
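As an illustration of that fallback, here is a minimal C sketch. It is not the wrapper merged with this series; the name example_csum_and_copy_from_user and its exact error handling are hypothetical, and only csum_partial_copy_generic(), copy_from_user() and csum_partial() are the usual kernel helpers. The idea: checksum while copying, and if the copy faults, copy what is accessible, zero the rest and checksum the destination so the returned checksum always matches the bytes left in dst.

#include <linux/string.h>
#include <linux/uaccess.h>
#include <asm/checksum.h>

/* Hypothetical sketch, not the merged wrapper code. */
static __wsum example_csum_and_copy_from_user(const void __user *src, void *dst,
					      int len, __wsum sum, int *err_ptr)
{
	int err = 0;
	__wsum csum;

	/* src_err = &err, dst_err = NULL: the kernel destination should not fault */
	csum = csum_partial_copy_generic((__force const void *)src, dst, len,
					 sum, &err, NULL);

	if (err) {
		/* Faulted partway through: copy what we can, zero the rest,
		 * then recalculate the checksum over the whole destination. */
		int missing = copy_from_user(dst, src, len);

		memset((char *)dst + (len - missing), 0, missing);
		csum = csum_partial(dst, len, sum);
		*err_ptr = -EFAULT;
	}

	return csum;
}

Keeping this fixup in C is what lets the .Lsrc_error/.Ldest_error paths in the assembler below stay trivial.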
To test this I forced checksumming on over loopback and ran socklib (a
simple TCP benchmark). On a POWER6 575, throughput improved by 19% with
this patch. If I forced both the sender and receiver onto the same cpu
(with the hope of shifting the benchmark from being cache bandwidth limited
to cpu limited), adding this patch improved performance by 55%.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib/checksum_64.S')
 -rw-r--r--  arch/powerpc/lib/checksum_64.S | 289
 1 file changed, 202 insertions(+), 87 deletions(-)
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index 404d5a6e3387..18245af38aea 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -228,115 +228,230 @@ _GLOBAL(csum_partial)
 	srdi	r3,r3,32
 	blr
 
+
+	.macro source
+100:
+	.section __ex_table,"a"
+	.align 3
+	.llong 100b,.Lsrc_error
+	.previous
+	.endm
+
+	.macro dest
+200:
+	.section __ex_table,"a"
+	.align 3
+	.llong 200b,.Ldest_error
+	.previous
+	.endm
+
 /*
  * Computes the checksum of a memory block at src, length len,
  * and adds in "sum" (32-bit), while copying the block to dst.
  * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
- * src) zeroes the rest of dst.
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky. For now we'll leave it until we instrument it somehow.
+ * to *src_err or *dst_err respectively. The caller must take any action
+ * required in this case (zeroing memory, recalculating partial checksum etc).
  *
  * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
  */
 _GLOBAL(csum_partial_copy_generic)
-	addic	r0,r6,0
-	subi	r3,r3,4
-	subi	r4,r4,4
-	srwi.	r6,r5,2
-	beq	3f		/* if we're doing < 4 bytes */
-	andi.	r9,r4,2		/* Align dst to longword boundary */
-	beq+	1f
-81:	lhz	r6,4(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
+	addic	r0,r6,0			/* clear carry */
+
+	srdi.	r6,r5,3			/* less than 8 bytes? */
+	beq	.Lcopy_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 *
+	 * If the source and destination are relatively unaligned we only
+	 * align the source. This keeps things simple.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcopy_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+source;	lhz	r6,0(r3)		/* align to doubleword */
 	subi	r5,r5,2
-91:	sth	r6,4(r4)
-	addi	r4,r4,2
-	addc	r0,r0,r6
-	srwi.	r6,r5,2		/* # words to do */
-	beq	3f
-1:	mtctr	r6
-82:	lwzu	r6,4(r3)	/* the bdnz has zero overhead, so it should */
-92:	stwu	r6,4(r4)	/* be unnecessary to unroll this loop */
-	adde	r0,r0,r6
-	bdnz	82b
-	andi.	r5,r5,3
-3:	cmpwi	0,r5,2
-	blt+	4f
-83:	lhz	r6,4(r3)
 	addi	r3,r3,2
-	subi	r5,r5,2
-93:	sth	r6,4(r4)
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
+	bdnz	1b
+
+.Lcopy_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r5,7
+	beq	.Lcopy_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r5,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+source;	ld	r6,0(r3)
+source;	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+source;	ld	r10,16(r3)
+source;	ld	r11,24(r3)
+	bdnz	2b
+
+
 	adde	r0,r0,r6
-4:	cmpwi	0,r5,1
-	bne+	5f
-84:	lbz	r6,4(r3)
-94:	stb	r6,4(r4)
-	slwi	r6,r6,8		/* Upper byte of word */
+source;	ld	r12,32(r3)
+source;	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+source;	ld	r15,48(r3)
+source;	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+dest;	std	r6,0(r4)
+dest;	std	r9,8(r4)
+
+	adde	r0,r0,r11
+dest;	std	r10,16(r4)
+dest;	std	r11,24(r4)
+
+	adde	r0,r0,r12
+dest;	std	r12,32(r4)
+dest;	std	r14,40(r4)
+
+	adde	r0,r0,r14
+dest;	std	r15,48(r4)
+dest;	std	r16,56(r4)
+	addi	r4,r4,64
+
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r5,r5,63
+
+.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r5,3
+	beq	.Lcopy_tail_word
+
+	mtctr	r6
+3:
+source;	ld	r6,0(r3)
+	addi	r3,r3,8
 	adde	r0,r0,r6
-5:	addze	r3,r0		/* add in final carry (unlikely with 64-bit regs) */
-	rldicl	r4,r3,32,0	/* fold 64 bit value */
-	add	r3,r4,r3
-	srdi	r3,r3,32
-	blr
+dest;	std	r6,0(r4)
+	addi	r4,r4,8
+	bdnz	3b
 
-/* These shouldn't go in the fixup section, since that would
-   cause the ex_table addresses to get out of order. */
+	andi.	r5,r5,7
 
-	.globl src_error_1
-src_error_1:
-	li	r6,0
-	subi	r5,r5,2
-95:	sth	r6,4(r4)
+.Lcopy_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r5,2
+	beq	.Lcopy_tail_halfword
+
+source;	lwz	r6,0(r3)
+	addi	r3,r3,4
+	adde	r0,r0,r6
+dest;	stw	r6,0(r4)
+	addi	r4,r4,4
+	subi	r5,r5,4
+
+.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r5,1
+	beq	.Lcopy_tail_byte
+
+source;	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+dest;	sth	r6,0(r4)
 	addi	r4,r4,2
-	srwi.	r6,r5,2
-	beq	3f
-	mtctr	r6
-	.globl src_error_2
-src_error_2:
-	li	r6,0
-96:	stwu	r6,4(r4)
-	bdnz	96b
-3:	andi.	r5,r5,3
-	beq	src_error
-	.globl src_error_3
-src_error_3:
-	li	r6,0
-	mtctr	r5
-	addi	r4,r4,3
-97:	stbu	r6,1(r4)
-	bdnz	97b
-	.globl src_error
-src_error:
+	subi	r5,r5,2
+
+.Lcopy_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r5,1
+	beq	.Lcopy_finish
+
+source;	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+dest;	stb	r6,0(r4)
+
+.Lcopy_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
+
+.Lsrc_error:
 	cmpdi	0,r7,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r7)
-1:	addze	r3,r0
 	blr
 
-	.globl dst_error
-dst_error:
+.Ldest_error:
 	cmpdi	0,r8,0
-	beq	1f
+	beqlr
 	li	r6,-EFAULT
 	stw	r6,0(r8)
-1:	addze	r3,r0
 	blr
-
-	.section __ex_table,"a"
-	.align	3
-	.llong	81b,src_error_1
-	.llong	91b,dst_error
-	.llong	82b,src_error_2
-	.llong	92b,dst_error
-	.llong	83b,src_error_3
-	.llong	93b,dst_error
-	.llong	84b,src_error_3
-	.llong	94b,dst_error
-	.llong	95b,dst_error
-	.llong	96b,dst_error
-	.llong	97b,dst_error
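For reference, the .Lcopy_finish fold above (rldicl r4,r0,32,0 / add r3,r4,r0 / srdi r3,r3,32) is what reduces the 64-bit running sum to the 32-bit partial checksum the callers expect. A C sketch of what those three instructions compute (illustrative only; fold64() is not a kernel function):

#include <stdint.h>

/* Swap the two 32-bit halves of the 64-bit accumulator (rldicl), add the
 * swapped value to the original so both halves sum into the upper word with
 * the carry folded end-around (add), then shift the result down (srdi). */
static inline uint32_t fold64(uint64_t sum)
{
	uint64_t rotated = (sum << 32) | (sum >> 32);

	return (uint32_t)((sum + rotated) >> 32);
}

Folding further down to the final 16-bit Internet checksum still happens later in the callers, just as it does for csum_partial.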