Merge branch 'mpi-master' into wip-k-fmlp

Conflicts: litmus/sched_cedf.c
author: Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
committer: Glenn Elliott <gelliott@cs.unc.edu> 2012-03-04 19:47:13 -0500
commit: c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree: ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/powerpc/lib/checksum_64.S
parent: ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent: 6a00f206debf8a5c8899055726ad127dbeeed098 (diff)
1 files changed, 355 insertions, 127 deletions
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index ef96c6c58efc..18245af38aea 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -65,165 +65,393 @@ _GLOBAL(csum_tcpudp_magic)
        srwi    r3,r3,16
        blr
+#define STACKFRAMESIZE 256
+#define STK_REG(i)      (112 + ((i)-14)*8)
 /*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
 _GLOBAL(csum_partial)
-        subi    r3,r3,8         /* we'll offset by 8 for the loads */
+        addic   r0,r5,0                 /* clear carry */
-        srdi.   r6,r4,3         /* divide by 8 for doubleword count */
-        addic   r5,r5,0         /* clear carry */
+        srdi.   r6,r4,3                 /* less than 8 bytes? */
-        beq     3f              /* if we're doing < 8 bytes */
+        beq     .Lcsum_tail_word
-        andi.   r0,r3,2         /* aligned on a word boundary already? */
-        beq+    1f
+        /*
-        lhz     r6,8(r3)        /* do 2 bytes to get aligned */
+         * If only halfword aligned, align to a double word. Since odd
-        addi    r3,r3,2
+         * aligned addresses should be rare and they would require more
-        subi    r4,r4,2
+         * work to calculate the correct checksum, we ignore that case
-        addc    r5,r5,r6
+         * and take the potential slowdown of unaligned loads.
-        srdi.   r6,r4,3         /* recompute number of doublewords */
+         */
-        beq     3f              /* any left? */
+        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
-1:      mtctr   r6
+        beq     .Lcsum_aligned
-2:      ldu     r6,8(r3)        /* main sum loop */
-        adde    r5,r5,r6
+        li      r7,4
-        bdnz    2b
+        sub     r6,r7,r6
-        andi.   r4,r4,7         /* compute bytes left to sum after doublewords */
+        mtctr   r6
-3:      cmpwi   0,r4,4          /* is at least a full word left? */
-        blt     4f
+1:
-        lwz     r6,8(r3)        /* sum this word */
+        lhz     r6,0(r3)                /* align to doubleword */
+        subi    r4,r4,2
+        addi    r3,r3,2
+        adde    r0,r0,r6
+        bdnz    1b
+.Lcsum_aligned:
+        /*
+         * We unroll the loop such that each iteration is 64 bytes with an
+         * entry and exit limb of 64 bytes, meaning a minimum size of
+         * 128 bytes.
+         */
+        srdi.   r6,r4,7
+        beq     .Lcsum_tail_doublewords         /* len < 128 */
+        srdi    r6,r4,6
+        subi    r6,r6,1
+        mtctr   r6
+        stdu    r1,-STACKFRAMESIZE(r1)
+        std     r14,STK_REG(r14)(r1)
+        std     r15,STK_REG(r15)(r1)
+        std     r16,STK_REG(r16)(r1)
+        ld      r6,0(r3)
+        ld      r9,8(r3)
+        ld      r10,16(r3)
+        ld      r11,24(r3)
+        /*
+         * On POWER6 and POWER7 back to back addes take 2 cycles because of
+         * the XER dependency. This means the fastest this loop can go is
+         * 16 cycles per iteration. The scheduling of the loop below has
+         * been shown to hit this on both POWER6 and POWER7.
+         */
+        .align 5
+2:
+        adde    r0,r0,r6
+        ld      r12,32(r3)
+        ld      r14,40(r3)
+        adde    r0,r0,r9
+        ld      r15,48(r3)
+        ld      r16,56(r3)
+        addi    r3,r3,64
+        adde    r0,r0,r10
+        adde    r0,r0,r11
+        adde    r0,r0,r12
+        adde    r0,r0,r14
+        adde    r0,r0,r15
+        ld      r6,0(r3)
+        ld      r9,8(r3)
+        adde    r0,r0,r16
+        ld      r10,16(r3)
+        ld      r11,24(r3)
+        bdnz    2b
+        adde    r0,r0,r6
+        ld      r12,32(r3)
+        ld      r14,40(r3)
+        adde    r0,r0,r9
+        ld      r15,48(r3)
+        ld      r16,56(r3)
+        addi    r3,r3,64
+        adde    r0,r0,r10
+        adde    r0,r0,r11
+        adde    r0,r0,r12
+        adde    r0,r0,r14
+        adde    r0,r0,r15
+        adde    r0,r0,r16
+        ld      r14,STK_REG(r14)(r1)
+        ld      r15,STK_REG(r15)(r1)
+        ld      r16,STK_REG(r16)(r1)
+        addi    r1,r1,STACKFRAMESIZE
+        andi.   r4,r4,63
+.Lcsum_tail_doublewords:                /* Up to 127 bytes to go */
+        srdi.   r6,r4,3
+        beq     .Lcsum_tail_word
+        mtctr   r6
+3:
+        ld      r6,0(r3)
+        addi    r3,r3,8
+        adde    r0,r0,r6
+        bdnz    3b
+        andi.   r4,r4,7
+.Lcsum_tail_word:                       /* Up to 7 bytes to go */
+        srdi.   r6,r4,2
+        beq     .Lcsum_tail_halfword
+        lwz     r6,0(r3)
        addi    r3,r3,4
+        adde    r0,r0,r6
        subi    r4,r4,4
-        adde    r5,r5,r6
-4:      cmpwi   0,r4,2          /* is at least a halfword left? */
+.Lcsum_tail_halfword:                   /* Up to 3 bytes to go */
-        blt+    5f
+        srdi.   r6,r4,1
-        lhz     r6,8(r3)        /* sum this halfword */
+        beq     .Lcsum_tail_byte
-        addi    r3,r3,2
-        subi    r4,r4,2
+        lhz     r6,0(r3)
-        adde    r5,r5,r6
+        addi    r3,r3,2
-5:      cmpwi   0,r4,1          /* is at least a byte left? */
+        adde    r0,r0,r6
-        bne+    6f
+        subi    r4,r4,2
-        lbz     r6,8(r3)        /* sum this byte */
-        slwi    r6,r6,8         /* this byte is assumed to be the upper byte of a halfword */
+.Lcsum_tail_byte:                       /* Up to 1 byte to go */
-        adde    r5,r5,r6
+        andi.   r6,r4,1
-6:      addze   r5,r5           /* add in final carry */
+        beq     .Lcsum_finish
-        rldicl  r4,r5,32,0      /* fold two 32-bit halves together */
-        add     r3,r4,r5
+        lbz     r6,0(r3)
-        srdi    r3,r3,32
+        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
-        blr
+        adde    r0,r0,r9
+.Lcsum_finish:
+        addze   r0,r0                   /* add in final carry */
+        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
+        add     r3,r4,r0
+        srdi    r3,r3,32
+        blr
+        .macro source
+100:
+        .section __ex_table,"a"
+        .align 3
+        .llong 100b,.Lsrc_error
+        .previous
+        .endm
+        .macro dest
+200:
+        .section __ex_table,"a"
+        .align 3
+        .llong 200b,.Ldest_error
+        .previous
+        .endm
 /*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
- * to *src_err or *dst_err respectively, and (for an error on
+ * to *src_err or *dst_err respectively. The caller must take any action
- * src) zeroes the rest of dst.
+ * required in this case (zeroing memory, recalculating partial checksum etc).
- *
- * This code needs to be reworked to take advantage of 64 bit sum+copy.
- * However, due to tokenring halfword alignment problems this will be very
- * tricky.  For now we'll leave it until we instrument it somehow.
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
 _GLOBAL(csum_partial_copy_generic)
-        addic   r0,r6,0
+        addic   r0,r6,0                 /* clear carry */
-        subi    r3,r3,4
-        subi    r4,r4,4
+        srdi.   r6,r5,3                 /* less than 8 bytes? */
-        srwi.   r6,r5,2
+        beq     .Lcopy_tail_word
-        beq     3f              /* if we're doing < 4 bytes */
-        andi.   r9,r4,2         /* Align dst to longword boundary */
+        /*
-        beq+    1f
+         * If only halfword aligned, align to a double word. Since odd
-81:     lhz     r6,4(r3)        /* do 2 bytes to get aligned */
+         * aligned addresses should be rare and they would require more
-        addi    r3,r3,2
+         * work to calculate the correct checksum, we ignore that case
+         * and take the potential slowdown of unaligned loads.
+         *
+         * If the source and destination are relatively unaligned we only
+         * align the source. This keeps things simple.
+         */
+        rldicl. r6,r3,64-1,64-2         /* r6 = (r3 >> 1) & 0x3 */
+        beq     .Lcopy_aligned
+        li      r7,4
+        sub     r6,r7,r6
+        mtctr   r6
+1:
+source; lhz     r6,0(r3)                /* align to doubleword */
        subi    r5,r5,2
-91:     sth     r6,4(r4)
-        addi    r4,r4,2
-        addc    r0,r0,r6
-        srwi.   r6,r5,2         /* # words to do */
-        beq     3f
-1:      mtctr   r6
-82:     lwzu    r6,4(r3)        /* the bdnz has zero overhead, so it should */
-92:     stwu    r6,4(r4)        /* be unnecessary to unroll this loop */
-        adde    r0,r0,r6
-        bdnz    82b
-        andi.   r5,r5,3
-3:      cmpwi   0,r5,2
-        blt+    4f
-83:     lhz     r6,4(r3)
        addi    r3,r3,2
-        subi    r5,r5,2
+        adde    r0,r0,r6
-93:     sth     r6,4(r4)
+dest;   sth     r6,0(r4)
        addi    r4,r4,2
+        bdnz    1b
+.Lcopy_aligned:
+        /*
+         * We unroll the loop such that each iteration is 64 bytes with an
+         * entry and exit limb of 64 bytes, meaning a minimum size of
+         * 128 bytes.
+         */
+        srdi.   r6,r5,7
+        beq     .Lcopy_tail_doublewords         /* len < 128 */
+        srdi    r6,r5,6
+        subi    r6,r6,1
+        mtctr   r6
+        stdu    r1,-STACKFRAMESIZE(r1)
+        std     r14,STK_REG(r14)(r1)
+        std     r15,STK_REG(r15)(r1)
+        std     r16,STK_REG(r16)(r1)
+source; ld      r6,0(r3)
+source; ld      r9,8(r3)
+source; ld      r10,16(r3)
+source; ld      r11,24(r3)
+        /*
+         * On POWER6 and POWER7 back to back addes take 2 cycles because of
+         * the XER dependency. This means the fastest this loop can go is
+         * 16 cycles per iteration. The scheduling of the loop below has
+         * been shown to hit this on both POWER6 and POWER7.
+         */
+        .align 5
+2:
        adde    r0,r0,r6
-4:      cmpwi   0,r5,1
+source; ld      r12,32(r3)
-        bne+    5f
+source; ld      r14,40(r3)
-84:     lbz     r6,4(r3)
-94:     stb     r6,4(r4)
+        adde    r0,r0,r9
-        slwi    r6,r6,8         /* Upper byte of word */
+source; ld      r15,48(r3)
+source; ld      r16,56(r3)
+        addi    r3,r3,64
+        adde    r0,r0,r10
+dest;   std     r6,0(r4)
+dest;   std     r9,8(r4)
+        adde    r0,r0,r11
+dest;   std     r10,16(r4)
+dest;   std     r11,24(r4)
+        adde    r0,r0,r12
+dest;   std     r12,32(r4)
+dest;   std     r14,40(r4)
+        adde    r0,r0,r14
+dest;   std     r15,48(r4)
+dest;   std     r16,56(r4)
+        addi    r4,r4,64
+        adde    r0,r0,r15
+source; ld      r6,0(r3)
+source; ld      r9,8(r3)
+        adde    r0,r0,r16
+source; ld      r10,16(r3)
+source; ld      r11,24(r3)
+        bdnz    2b
        adde    r0,r0,r6
-5:      addze   r3,r0           /* add in final carry (unlikely with 64-bit regs) */
+source; ld      r12,32(r3)
-        rldicl  r4,r3,32,0      /* fold 64 bit value */
+source; ld      r14,40(r3)
-        add     r3,r4,r3
-        srdi    r3,r3,32
-        blr
-/* These shouldn't go in the fixup section, since that would
+        adde    r0,r0,r9
-   cause the ex_table addresses to get out of order. */
+source; ld      r15,48(r3)
+source; ld      r16,56(r3)
+        addi    r3,r3,64
+        adde    r0,r0,r10
+dest;   std     r6,0(r4)
+dest;   std     r9,8(r4)
+        adde    r0,r0,r11
+dest;   std     r10,16(r4)
+dest;   std     r11,24(r4)
+        adde    r0,r0,r12
+dest;   std     r12,32(r4)
+dest;   std     r14,40(r4)
+        adde    r0,r0,r14
+dest;   std     r15,48(r4)
+dest;   std     r16,56(r4)
+        addi    r4,r4,64
+        adde    r0,r0,r15
+        adde    r0,r0,r16
+        ld      r14,STK_REG(r14)(r1)
+        ld      r15,STK_REG(r15)(r1)
+        ld      r16,STK_REG(r16)(r1)
+        addi    r1,r1,STACKFRAMESIZE
+        andi.   r5,r5,63
+.Lcopy_tail_doublewords:                /* Up to 127 bytes to go */
+        srdi.   r6,r5,3
+        beq     .Lcopy_tail_word
-        .globl src_error_1
-src_error_1:
-        li      r6,0
-        subi    r5,r5,2
-95:     sth     r6,4(r4)
-        addi    r4,r4,2
-        srwi.   r6,r5,2
-        beq     3f
        mtctr   r6
-        .globl src_error_2
+3:
-src_error_2:
+source; ld      r6,0(r3)
-        li      r6,0
+        addi    r3,r3,8
-96:     stwu    r6,4(r4)
+        adde    r0,r0,r6
-        bdnz    96b
+dest;   std     r6,0(r4)
-3:      andi.   r5,r5,3
+        addi    r4,r4,8
-        beq     src_error
+        bdnz    3b
-        .globl src_error_3
-src_error_3:
+        andi.   r5,r5,7
-        li      r6,0
-        mtctr   r5
+.Lcopy_tail_word:                       /* Up to 7 bytes to go */
-        addi    r4,r4,3
+        srdi.   r6,r5,2
-97:     stbu    r6,1(r4)
+        beq     .Lcopy_tail_halfword
-        bdnz    97b
-        .globl src_error
+source; lwz     r6,0(r3)
-src_error:
+        addi    r3,r3,4
+        adde    r0,r0,r6
+dest;   stw     r6,0(r4)
+        addi    r4,r4,4
+        subi    r5,r5,4
+.Lcopy_tail_halfword:                   /* Up to 3 bytes to go */
+        srdi.   r6,r5,1
+        beq     .Lcopy_tail_byte
+source; lhz     r6,0(r3)
+        addi    r3,r3,2
+        adde    r0,r0,r6
+dest;   sth     r6,0(r4)
+        addi    r4,r4,2
+        subi    r5,r5,2
+.Lcopy_tail_byte:                       /* Up to 1 byte to go */
+        andi.   r6,r5,1
+        beq     .Lcopy_finish
+source; lbz     r6,0(r3)
+        sldi    r9,r6,8                 /* Pad the byte out to 16 bits */
+        adde    r0,r0,r9
+dest;   stb     r6,0(r4)
+.Lcopy_finish:
+        addze   r0,r0                   /* add in final carry */
+        rldicl  r4,r0,32,0              /* fold two 32 bit halves together */
+        add     r3,r4,r0
+        srdi    r3,r3,32
+        blr
+.Lsrc_error:
        cmpdi   0,r7,0
-        beq     1f
+        beqlr
        li      r6,-EFAULT
        stw     r6,0(r7)
-1:      addze   r3,r0
        blr
-        .globl dst_error
+.Ldest_error:
-dst_error:
        cmpdi   0,r8,0
-        beq     1f
+        beqlr
        li      r6,-EFAULT
        stw     r6,0(r8)
-1:      addze   r3,r0
        blr
-.section __ex_table,"a"
-        .align  3
-        .llong  81b,src_error_1
-        .llong  91b,dst_error
-        .llong  82b,src_error_2
-        .llong  92b,dst_error
-        .llong  83b,src_error_3
-        .llong  93b,dst_error
-        .llong  84b,src_error_3
-        .llong  94b,dst_error
-        .llong  95b,dst_error
-        .llong  96b,dst_error
-        .llong  97b,dst_error
author	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer	Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit	c71c03bda1e86c9d5198c5d83f712e695c4f2a1e (patch)
tree	ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /arch/powerpc/lib/checksum_64.S
parent	ea53c912f8a86a8567697115b6a0d8152beee5c8 (diff)
parent	6a00f206debf8a5c8899055726ad127dbeeed098 (diff)

diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S index ef96c6c58efc..18245af38aea 100644 --- a/arch/powerpc/lib/checksum_64.S +++ b/arch/powerpc/lib/checksum_64.S
@@ -65,165 +65,393 @@ _GLOBAL(csum_tcpudp_magic)
65	srwi r3,r3,16	65	srwi r3,r3,16
66	blr	66	blr
67		67
		68	#define STACKFRAMESIZE 256
		69	#define STK_REG(i) (112 + ((i)-14)*8)
		70
68	/*	71	/*
69	* Computes the checksum of a memory block at buff, length len,	72	* Computes the checksum of a memory block at buff, length len,
70	* and adds in "sum" (32-bit).	73	* and adds in "sum" (32-bit).
71	*	74	*
72	* This code assumes at least halfword alignment, though the length
73	* can be any number of bytes. The sum is accumulated in r5.
74	*
75	* csum_partial(r3=buff, r4=len, r5=sum)	75	* csum_partial(r3=buff, r4=len, r5=sum)
76	*/	76	*/
77	_GLOBAL(csum_partial)	77	_GLOBAL(csum_partial)
78	subi r3,r3,8 /* we'll offset by 8 for the loads */	78	addic r0,r5,0 /* clear carry */
79	srdi. r6,r4,3 /* divide by 8 for doubleword count */	79
80	addic r5,r5,0 /* clear carry */	80	srdi. r6,r4,3 /* less than 8 bytes? */
81	beq 3f /* if we're doing < 8 bytes */	81	beq .Lcsum_tail_word
82	andi. r0,r3,2 /* aligned on a word boundary already? */	82
83	beq+ 1f	83	/*
84	lhz r6,8(r3) /* do 2 bytes to get aligned */	84	* If only halfword aligned, align to a double word. Since odd
85	addi r3,r3,2	85	* aligned addresses should be rare and they would require more
86	subi r4,r4,2	86	* work to calculate the correct checksum, we ignore that case
87	addc r5,r5,r6	87	* and take the potential slowdown of unaligned loads.
88	srdi. r6,r4,3 /* recompute number of doublewords */	88	*/
89	beq 3f /* any left? */	89	rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
90	1: mtctr r6	90	beq .Lcsum_aligned
91	2: ldu r6,8(r3) /* main sum loop */	91
92	adde r5,r5,r6	92	li r7,4
93	bdnz 2b	93	sub r6,r7,r6
94	andi. r4,r4,7 /* compute bytes left to sum after doublewords */	94	mtctr r6
95	3: cmpwi 0,r4,4 /* is at least a full word left? */	95
96	blt 4f	96	1:
97	lwz r6,8(r3) /* sum this word */	97	lhz r6,0(r3) /* align to doubleword */
		98	subi r4,r4,2
		99	addi r3,r3,2
		100	adde r0,r0,r6
		101	bdnz 1b
		102
		103	.Lcsum_aligned:
		104	/*
		105	* We unroll the loop such that each iteration is 64 bytes with an
		106	* entry and exit limb of 64 bytes, meaning a minimum size of
		107	* 128 bytes.
		108	*/
		109	srdi. r6,r4,7
		110	beq .Lcsum_tail_doublewords /* len < 128 */
		111
		112	srdi r6,r4,6
		113	subi r6,r6,1
		114	mtctr r6
		115
		116	stdu r1,-STACKFRAMESIZE(r1)
		117	std r14,STK_REG(r14)(r1)
		118	std r15,STK_REG(r15)(r1)
		119	std r16,STK_REG(r16)(r1)
		120
		121	ld r6,0(r3)
		122	ld r9,8(r3)
		123
		124	ld r10,16(r3)
		125	ld r11,24(r3)
		126
		127	/*
		128	* On POWER6 and POWER7 back to back addes take 2 cycles because of
		129	* the XER dependency. This means the fastest this loop can go is
		130	* 16 cycles per iteration. The scheduling of the loop below has
		131	* been shown to hit this on both POWER6 and POWER7.
		132	*/
		133	.align 5
		134	2:
		135	adde r0,r0,r6
		136	ld r12,32(r3)
		137	ld r14,40(r3)
		138
		139	adde r0,r0,r9
		140	ld r15,48(r3)
		141	ld r16,56(r3)
		142	addi r3,r3,64
		143
		144	adde r0,r0,r10
		145
		146	adde r0,r0,r11
		147
		148	adde r0,r0,r12
		149
		150	adde r0,r0,r14
		151
		152	adde r0,r0,r15
		153	ld r6,0(r3)
		154	ld r9,8(r3)
		155
		156	adde r0,r0,r16
		157	ld r10,16(r3)
		158	ld r11,24(r3)
		159	bdnz 2b
		160
		161
		162	adde r0,r0,r6
		163	ld r12,32(r3)
		164	ld r14,40(r3)
		165
		166	adde r0,r0,r9
		167	ld r15,48(r3)
		168	ld r16,56(r3)
		169	addi r3,r3,64
		170
		171	adde r0,r0,r10
		172	adde r0,r0,r11
		173	adde r0,r0,r12
		174	adde r0,r0,r14
		175	adde r0,r0,r15
		176	adde r0,r0,r16
		177
		178	ld r14,STK_REG(r14)(r1)
		179	ld r15,STK_REG(r15)(r1)
		180	ld r16,STK_REG(r16)(r1)
		181	addi r1,r1,STACKFRAMESIZE
		182
		183	andi. r4,r4,63
		184
		185	.Lcsum_tail_doublewords: /* Up to 127 bytes to go */
		186	srdi. r6,r4,3
		187	beq .Lcsum_tail_word
		188
		189	mtctr r6
		190	3:
		191	ld r6,0(r3)
		192	addi r3,r3,8
		193	adde r0,r0,r6
		194	bdnz 3b
		195
		196	andi. r4,r4,7
		197
		198	.Lcsum_tail_word: /* Up to 7 bytes to go */
		199	srdi. r6,r4,2
		200	beq .Lcsum_tail_halfword
		201
		202	lwz r6,0(r3)
98	addi r3,r3,4	203	addi r3,r3,4
		204	adde r0,r0,r6
99	subi r4,r4,4	205	subi r4,r4,4
100	adde r5,r5,r6	206
101	4: cmpwi 0,r4,2 /* is at least a halfword left? */	207	.Lcsum_tail_halfword: /* Up to 3 bytes to go */
102	blt+ 5f	208	srdi. r6,r4,1
103	lhz r6,8(r3) /* sum this halfword */	209	beq .Lcsum_tail_byte
104	addi r3,r3,2	210
105	subi r4,r4,2	211	lhz r6,0(r3)
106	adde r5,r5,r6	212	addi r3,r3,2
107	5: cmpwi 0,r4,1 /* is at least a byte left? */	213	adde r0,r0,r6
108	bne+ 6f	214	subi r4,r4,2
109	lbz r6,8(r3) /* sum this byte */	215
110	slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */	216	.Lcsum_tail_byte: /* Up to 1 byte to go */
111	adde r5,r5,r6	217	andi. r6,r4,1
112	6: addze r5,r5 /* add in final carry */	218	beq .Lcsum_finish
113	rldicl r4,r5,32,0 /* fold two 32-bit halves together */	219
114	add r3,r4,r5	220	lbz r6,0(r3)
115	srdi r3,r3,32	221	sldi r9,r6,8 /* Pad the byte out to 16 bits */
116	blr	222	adde r0,r0,r9
		223
		224	.Lcsum_finish:
		225	addze r0,r0 /* add in final carry */
		226	rldicl r4,r0,32,0 /* fold two 32 bit halves together */
		227	add r3,r4,r0
		228	srdi r3,r3,32
		229	blr
		230
		231
		232	.macro source
		233	100:
		234	.section __ex_table,"a"
		235	.align 3
		236	.llong 100b,.Lsrc_error
		237	.previous
		238	.endm
		239
		240	.macro dest
		241	200:
		242	.section __ex_table,"a"
		243	.align 3
		244	.llong 200b,.Ldest_error
		245	.previous
		246	.endm
117		247
118	/*	248	/*
119	* Computes the checksum of a memory block at src, length len,	249	* Computes the checksum of a memory block at src, length len,
120	* and adds in "sum" (32-bit), while copying the block to dst.	250	* and adds in "sum" (32-bit), while copying the block to dst.
121	* If an access exception occurs on src or dst, it stores -EFAULT	251	* If an access exception occurs on src or dst, it stores -EFAULT
122	* to *src_err or *dst_err respectively, and (for an error on	252	* to *src_err or *dst_err respectively. The caller must take any action
123	* src) zeroes the rest of dst.	253	* required in this case (zeroing memory, recalculating partial checksum etc).
124	*
125	* This code needs to be reworked to take advantage of 64 bit sum+copy.
126	* However, due to tokenring halfword alignment problems this will be very
127	* tricky. For now we'll leave it until we instrument it somehow.
128	*	254	*
129	* csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)	255	* csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
130	*/	256	*/
131	_GLOBAL(csum_partial_copy_generic)	257	_GLOBAL(csum_partial_copy_generic)
132	addic r0,r6,0	258	addic r0,r6,0 /* clear carry */
133	subi r3,r3,4	259
134	subi r4,r4,4	260	srdi. r6,r5,3 /* less than 8 bytes? */
135	srwi. r6,r5,2	261	beq .Lcopy_tail_word
136	beq 3f /* if we're doing < 4 bytes */	262
137	andi. r9,r4,2 /* Align dst to longword boundary */	263	/*
138	beq+ 1f	264	* If only halfword aligned, align to a double word. Since odd
139	81: lhz r6,4(r3) /* do 2 bytes to get aligned */	265	* aligned addresses should be rare and they would require more
140	addi r3,r3,2	266	* work to calculate the correct checksum, we ignore that case
		267	* and take the potential slowdown of unaligned loads.
		268	*
		269	* If the source and destination are relatively unaligned we only
		270	* align the source. This keeps things simple.
		271	*/
		272	rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */
		273	beq .Lcopy_aligned
		274
		275	li r7,4
		276	sub r6,r7,r6
		277	mtctr r6
		278
		279	1:
		280	source; lhz r6,0(r3) /* align to doubleword */
141	subi r5,r5,2	281	subi r5,r5,2
142	91: sth r6,4(r4)
143	addi r4,r4,2
144	addc r0,r0,r6
145	srwi. r6,r5,2 /* # words to do */
146	beq 3f
147	1: mtctr r6
148	82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */
149	92: stwu r6,4(r4) /* be unnecessary to unroll this loop */
150	adde r0,r0,r6
151	bdnz 82b
152	andi. r5,r5,3
153	3: cmpwi 0,r5,2
154	blt+ 4f
155	83: lhz r6,4(r3)
156	addi r3,r3,2	282	addi r3,r3,2
157	subi r5,r5,2	283	adde r0,r0,r6
158	93: sth r6,4(r4)	284	dest; sth r6,0(r4)
159	addi r4,r4,2	285	addi r4,r4,2
		286	bdnz 1b
		287
		288	.Lcopy_aligned:
		289	/*
		290	* We unroll the loop such that each iteration is 64 bytes with an
		291	* entry and exit limb of 64 bytes, meaning a minimum size of
		292	* 128 bytes.
		293	*/
		294	srdi. r6,r5,7
		295	beq .Lcopy_tail_doublewords /* len < 128 */
		296
		297	srdi r6,r5,6
		298	subi r6,r6,1
		299	mtctr r6
		300
		301	stdu r1,-STACKFRAMESIZE(r1)
		302	std r14,STK_REG(r14)(r1)
		303	std r15,STK_REG(r15)(r1)
		304	std r16,STK_REG(r16)(r1)
		305
		306	source; ld r6,0(r3)
		307	source; ld r9,8(r3)
		308
		309	source; ld r10,16(r3)
		310	source; ld r11,24(r3)
		311
		312	/*
		313	* On POWER6 and POWER7 back to back addes take 2 cycles because of
		314	* the XER dependency. This means the fastest this loop can go is
		315	* 16 cycles per iteration. The scheduling of the loop below has
		316	* been shown to hit this on both POWER6 and POWER7.
		317	*/
		318	.align 5
		319	2:
160	adde r0,r0,r6	320	adde r0,r0,r6
161	4: cmpwi 0,r5,1	321	source; ld r12,32(r3)
162	bne+ 5f	322	source; ld r14,40(r3)
163	84: lbz r6,4(r3)	323
164	94: stb r6,4(r4)	324	adde r0,r0,r9
165	slwi r6,r6,8 /* Upper byte of word */	325	source; ld r15,48(r3)
		326	source; ld r16,56(r3)
		327	addi r3,r3,64
		328
		329	adde r0,r0,r10
		330	dest; std r6,0(r4)
		331	dest; std r9,8(r4)
		332
		333	adde r0,r0,r11
		334	dest; std r10,16(r4)
		335	dest; std r11,24(r4)
		336
		337	adde r0,r0,r12
		338	dest; std r12,32(r4)
		339	dest; std r14,40(r4)
		340
		341	adde r0,r0,r14
		342	dest; std r15,48(r4)
		343	dest; std r16,56(r4)
		344	addi r4,r4,64
		345
		346	adde r0,r0,r15
		347	source; ld r6,0(r3)
		348	source; ld r9,8(r3)
		349
		350	adde r0,r0,r16
		351	source; ld r10,16(r3)
		352	source; ld r11,24(r3)
		353	bdnz 2b
		354
		355
166	adde r0,r0,r6	356	adde r0,r0,r6
167	5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */	357	source; ld r12,32(r3)
168	rldicl r4,r3,32,0 /* fold 64 bit value */	358	source; ld r14,40(r3)
169	add r3,r4,r3
170	srdi r3,r3,32
171	blr
172		359
173	/* These shouldn't go in the fixup section, since that would	360	adde r0,r0,r9
174	cause the ex_table addresses to get out of order. */	361	source; ld r15,48(r3)
		362	source; ld r16,56(r3)
		363	addi r3,r3,64
		364
		365	adde r0,r0,r10
		366	dest; std r6,0(r4)
		367	dest; std r9,8(r4)
		368
		369	adde r0,r0,r11
		370	dest; std r10,16(r4)
		371	dest; std r11,24(r4)
		372
		373	adde r0,r0,r12
		374	dest; std r12,32(r4)
		375	dest; std r14,40(r4)
		376
		377	adde r0,r0,r14
		378	dest; std r15,48(r4)
		379	dest; std r16,56(r4)
		380	addi r4,r4,64
		381
		382	adde r0,r0,r15
		383	adde r0,r0,r16
		384
		385	ld r14,STK_REG(r14)(r1)
		386	ld r15,STK_REG(r15)(r1)
		387	ld r16,STK_REG(r16)(r1)
		388	addi r1,r1,STACKFRAMESIZE
		389
		390	andi. r5,r5,63
		391
		392	.Lcopy_tail_doublewords: /* Up to 127 bytes to go */
		393	srdi. r6,r5,3
		394	beq .Lcopy_tail_word
175		395
176	.globl src_error_1
177	src_error_1:
178	li r6,0
179	subi r5,r5,2
180	95: sth r6,4(r4)
181	addi r4,r4,2
182	srwi. r6,r5,2
183	beq 3f
184	mtctr r6	396	mtctr r6
185	.globl src_error_2	397	3:
186	src_error_2:	398	source; ld r6,0(r3)
187	li r6,0	399	addi r3,r3,8
188	96: stwu r6,4(r4)	400	adde r0,r0,r6
189	bdnz 96b	401	dest; std r6,0(r4)
190	3: andi. r5,r5,3	402	addi r4,r4,8
191	beq src_error	403	bdnz 3b
192	.globl src_error_3	404
193	src_error_3:	405	andi. r5,r5,7
194	li r6,0	406
195	mtctr r5	407	.Lcopy_tail_word: /* Up to 7 bytes to go */
196	addi r4,r4,3	408	srdi. r6,r5,2
197	97: stbu r6,1(r4)	409	beq .Lcopy_tail_halfword
198	bdnz 97b	410
199	.globl src_error	411	source; lwz r6,0(r3)
200	src_error:	412	addi r3,r3,4
		413	adde r0,r0,r6
		414	dest; stw r6,0(r4)
		415	addi r4,r4,4
		416	subi r5,r5,4
		417
		418	.Lcopy_tail_halfword: /* Up to 3 bytes to go */
		419	srdi. r6,r5,1
		420	beq .Lcopy_tail_byte
		421
		422	source; lhz r6,0(r3)
		423	addi r3,r3,2
		424	adde r0,r0,r6
		425	dest; sth r6,0(r4)
		426	addi r4,r4,2
		427	subi r5,r5,2
		428
		429	.Lcopy_tail_byte: /* Up to 1 byte to go */
		430	andi. r6,r5,1
		431	beq .Lcopy_finish
		432
		433	source; lbz r6,0(r3)
		434	sldi r9,r6,8 /* Pad the byte out to 16 bits */
		435	adde r0,r0,r9
		436	dest; stb r6,0(r4)
		437
		438	.Lcopy_finish:
		439	addze r0,r0 /* add in final carry */
		440	rldicl r4,r0,32,0 /* fold two 32 bit halves together */
		441	add r3,r4,r0
		442	srdi r3,r3,32
		443	blr
		444
		445	.Lsrc_error:
201	cmpdi 0,r7,0	446	cmpdi 0,r7,0
202	beq 1f	447	beqlr
203	li r6,-EFAULT	448	li r6,-EFAULT
204	stw r6,0(r7)	449	stw r6,0(r7)
205	1: addze r3,r0
206	blr	450	blr
207		451
208	.globl dst_error	452	.Ldest_error:
209	dst_error:
210	cmpdi 0,r8,0	453	cmpdi 0,r8,0
211	beq 1f	454	beqlr
212	li r6,-EFAULT	455	li r6,-EFAULT
213	stw r6,0(r8)	456	stw r6,0(r8)
214	1: addze r3,r0
215	blr	457	blr
216
217	.section __ex_table,"a"
218	.align 3
219	.llong 81b,src_error_1
220	.llong 91b,dst_error
221	.llong 82b,src_error_2
222	.llong 92b,dst_error
223	.llong 83b,src_error_3
224	.llong 93b,dst_error
225	.llong 84b,src_error_3
226	.llong 94b,dst_error
227	.llong 95b,dst_error
228	.llong 96b,dst_error
229	.llong 97b,dst_error