path: root/arch/powerpc
author	Anton Blanchard <anton@samba.org>	2010-08-02 16:08:34 -0400
committer	Benjamin Herrenschmidt <benh@kernel.crashing.org>	2010-09-02 00:07:29 -0400
commit	9b83ecb0a3cf1bf7ecf84359ddcfb9dd49646bf2 (patch)
tree	5ee6f0184cad6056917fcd9ecc4bfd479f7710c8 /arch/powerpc
parent	93f68f1ef787d97ab688f78a01f446e85bb9a496 (diff)
powerpc: Optimise 64bit csum_partial
The main loop of csum_partial runs very slowly on recent POWER CPUs. After some analysis on both POWER6 and POWER7 I came up with the routine below. First we get the source aligned to a double word, ignoring any odd alignment to keep things simple. Then we do 64 bytes at a time, with an entry and exit limb of a further 64 bytes. On both POWER6 and POWER7 this should be as fast as we can go, since we are limited by the latency of the adde instructions.

To test this I forced checksumming on over loopback and ran socklib (a simple TCP benchmark). On a POWER6 575 throughput improved by 11% with this patch.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
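To make the approach above concrete, here is a rough C sketch (not part of the patch and not the kernel's implementation) of what the assembly computes: doublewords are accumulated with an end-around carry, the bulk of the buffer is consumed eight doublewords (64 bytes) per iteration, and the 64-bit accumulator is finally folded down to 32 bits. The names csum_partial_sketch, add_carry64 and csum_fold64 are made up for illustration, and the alignment handling and the byte/halfword/word tails of the real routine are omitted.

#include <stdint.h>
#include <stddef.h>

/* Emulate adde: add and wrap any carry-out straight back into the sum. */
static inline uint64_t add_carry64(uint64_t sum, uint64_t v)
{
	sum += v;
	return sum + (sum < v);	/* carry occurred iff the result wrapped */
}

/* Fold a 64-bit one's complement accumulator into 32 bits, as the
 * rldicl/add/srdi tail of the assembly does. */
static uint32_t csum_fold64(uint64_t sum)
{
	sum = (sum >> 32) + (sum & 0xffffffffULL);
	sum += sum >> 32;
	return (uint32_t)sum;
}

/* Illustrative only: checksum ndoublewords 64-bit words starting at p. */
static uint32_t csum_partial_sketch(const uint64_t *p, size_t ndoublewords,
				    uint32_t initial)
{
	uint64_t sum = initial;
	size_t i;

	/* 64 bytes (8 doublewords) per iteration, mirroring the unrolled loop */
	for (i = 0; i + 8 <= ndoublewords; i += 8) {
		sum = add_carry64(sum, p[i + 0]);
		sum = add_carry64(sum, p[i + 1]);
		sum = add_carry64(sum, p[i + 2]);
		sum = add_carry64(sum, p[i + 3]);
		sum = add_carry64(sum, p[i + 4]);
		sum = add_carry64(sum, p[i + 5]);
		sum = add_carry64(sum, p[i + 6]);
		sum = add_carry64(sum, p[i + 7]);
	}

	/* remaining whole doublewords */
	for (; i < ndoublewords; i++)
		sum = add_carry64(sum, p[i]);

	return csum_fold64(sum);
}

The sum + (sum < v) idiom wraps the hardware carry back into the low bits immediately; for a one's complement checksum this gives the same result as letting adde accumulate the carry in XER and adding it in at the end with addze.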
Diffstat (limited to 'arch/powerpc')
-rw-r--r--	arch/powerpc/lib/checksum_64.S	193
1 file changed, 153 insertions(+), 40 deletions(-)
diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S
index ef96c6c58ef..404d5a6e338 100644
--- a/arch/powerpc/lib/checksum_64.S
+++ b/arch/powerpc/lib/checksum_64.S
@@ -65,55 +65,168 @@ _GLOBAL(csum_tcpudp_magic)
 	srwi	r3,r3,16
 	blr
 
+#define STACKFRAMESIZE 256
+#define STK_REG(i)	(112 + ((i)-14)*8)
+
 /*
  * Computes the checksum of a memory block at buff, length len,
  * and adds in "sum" (32-bit).
  *
- * This code assumes at least halfword alignment, though the length
- * can be any number of bytes.  The sum is accumulated in r5.
- *
  * csum_partial(r3=buff, r4=len, r5=sum)
  */
 _GLOBAL(csum_partial)
-	subi	r3,r3,8		/* we'll offset by 8 for the loads */
-	srdi.	r6,r4,3		/* divide by 8 for doubleword count */
-	addic	r5,r5,0		/* clear carry */
-	beq	3f		/* if we're doing < 8 bytes */
-	andi.	r0,r3,2		/* aligned on a word boundary already? */
-	beq+	1f
-	lhz	r6,8(r3)	/* do 2 bytes to get aligned */
-	addi	r3,r3,2
-	subi	r4,r4,2
-	addc	r5,r5,r6
-	srdi.	r6,r4,3		/* recompute number of doublewords */
-	beq	3f		/* any left? */
-1:	mtctr	r6
-2:	ldu	r6,8(r3)	/* main sum loop */
-	adde	r5,r5,r6
-	bdnz	2b
-	andi.	r4,r4,7		/* compute bytes left to sum after doublewords */
-3:	cmpwi	0,r4,4		/* is at least a full word left? */
-	blt	4f
-	lwz	r6,8(r3)	/* sum this word */
+	addic	r0,r5,0			/* clear carry */
+
+	srdi.	r6,r4,3			/* less than 8 bytes? */
+	beq	.Lcsum_tail_word
+
+	/*
+	 * If only halfword aligned, align to a double word. Since odd
+	 * aligned addresses should be rare and they would require more
+	 * work to calculate the correct checksum, we ignore that case
+	 * and take the potential slowdown of unaligned loads.
+	 */
+	rldicl. r6,r3,64-1,64-2		/* r6 = (r3 & 0x3) >> 1 */
+	beq	.Lcsum_aligned
+
+	li	r7,4
+	sub	r6,r7,r6
+	mtctr	r6
+
+1:
+	lhz	r6,0(r3)		/* align to doubleword */
+	subi	r4,r4,2
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	bdnz	1b
+
+.Lcsum_aligned:
+	/*
+	 * We unroll the loop such that each iteration is 64 bytes with an
+	 * entry and exit limb of 64 bytes, meaning a minimum size of
+	 * 128 bytes.
+	 */
+	srdi.	r6,r4,7
+	beq	.Lcsum_tail_doublewords		/* len < 128 */
+
+	srdi	r6,r4,6
+	subi	r6,r6,1
+	mtctr	r6
+
+	stdu	r1,-STACKFRAMESIZE(r1)
+	std	r14,STK_REG(r14)(r1)
+	std	r15,STK_REG(r15)(r1)
+	std	r16,STK_REG(r16)(r1)
+
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+
+	/*
+	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
+	 * the XER dependency. This means the fastest this loop can go is
+	 * 16 cycles per iteration. The scheduling of the loop below has
+	 * been shown to hit this on both POWER6 and POWER7.
+	 */
+	.align 5
+2:
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+
+	adde	r0,r0,r11
+
+	adde	r0,r0,r12
+
+	adde	r0,r0,r14
+
+	adde	r0,r0,r15
+	ld	r6,0(r3)
+	ld	r9,8(r3)
+
+	adde	r0,r0,r16
+	ld	r10,16(r3)
+	ld	r11,24(r3)
+	bdnz	2b
+
+
+	adde	r0,r0,r6
+	ld	r12,32(r3)
+	ld	r14,40(r3)
+
+	adde	r0,r0,r9
+	ld	r15,48(r3)
+	ld	r16,56(r3)
+	addi	r3,r3,64
+
+	adde	r0,r0,r10
+	adde	r0,r0,r11
+	adde	r0,r0,r12
+	adde	r0,r0,r14
+	adde	r0,r0,r15
+	adde	r0,r0,r16
+
+	ld	r14,STK_REG(r14)(r1)
+	ld	r15,STK_REG(r15)(r1)
+	ld	r16,STK_REG(r16)(r1)
+	addi	r1,r1,STACKFRAMESIZE
+
+	andi.	r4,r4,63
+
+.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
+	srdi.	r6,r4,3
+	beq	.Lcsum_tail_word
+
+	mtctr	r6
+3:
+	ld	r6,0(r3)
+	addi	r3,r3,8
+	adde	r0,r0,r6
+	bdnz	3b
+
+	andi.	r4,r4,7
+
+.Lcsum_tail_word:			/* Up to 7 bytes to go */
+	srdi.	r6,r4,2
+	beq	.Lcsum_tail_halfword
+
+	lwz	r6,0(r3)
 	addi	r3,r3,4
+	adde	r0,r0,r6
 	subi	r4,r4,4
-	adde	r5,r5,r6
-4:	cmpwi	0,r4,2		/* is at least a halfword left? */
-	blt+	5f
-	lhz	r6,8(r3)	/* sum this halfword */
-	addi	r3,r3,2
-	subi	r4,r4,2
-	adde	r5,r5,r6
-5:	cmpwi	0,r4,1		/* is at least a byte left? */
-	bne+	6f
-	lbz	r6,8(r3)	/* sum this byte */
-	slwi	r6,r6,8		/* this byte is assumed to be the upper byte of a halfword */
-	adde	r5,r5,r6
-6:	addze	r5,r5		/* add in final carry */
-	rldicl	r4,r5,32,0	/* fold two 32-bit halves together */
-	add	r3,r4,r5
-	srdi	r3,r3,32
-	blr
+
+.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
+	srdi.	r6,r4,1
+	beq	.Lcsum_tail_byte
+
+	lhz	r6,0(r3)
+	addi	r3,r3,2
+	adde	r0,r0,r6
+	subi	r4,r4,2
+
+.Lcsum_tail_byte:			/* Up to 1 byte to go */
+	andi.	r6,r4,1
+	beq	.Lcsum_finish
+
+	lbz	r6,0(r3)
+	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
+	adde	r0,r0,r9
+
+.Lcsum_finish:
+	addze	r0,r0			/* add in final carry */
+	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
+	add	r3,r4,r0
+	srdi	r3,r3,32
+	blr
 
 /*
  * Computes the checksum of a memory block at src, length len,