aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnton Blanchard <anton@samba.org>2010-02-10 09:56:26 -0500
committerBenjamin Herrenschmidt <benh@kernel.crashing.org>2010-02-16 22:03:16 -0500
commit789c299ca280f96368c0296b739e89c0bb232f8a (patch)
treec14611126d351e6b69cb2db26afd4fbd77b3763f
parent63e6c5b8102af7df7a5e1cebbd865d711645886a (diff)
powerpc: Improve 64bit copy_tofrom_user
Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user. The loop now does 32 bytes at a time and as well as pairing loads and stores. A quick test case that reads 8kB over and over shows the improvement: POWER6: 53% faster POWER7: 51% faster #define _XOPEN_SOURCE 500 #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <sys/types.h> #include <sys/stat.h> #define BUFSIZE (8 * 1024) #define ITERATIONS 10000000 int main() { char tmpfile[] = "/tmp/copy_to_user_testXXXXXX"; int fd; char *buf[BUFSIZE]; unsigned long i; fd = mkstemp(tmpfile); if (fd < 0) { perror("open"); exit(1); } if (write(fd, buf, BUFSIZE) != BUFSIZE) { perror("open"); exit(1); } for (i = 0; i < 10000000; i++) { if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) { perror("pread"); exit(1); } } unlink(tmpfile); return 0; } Signed-off-by: Anton Blanchard <anton@samba.org> Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r--arch/powerpc/lib/copyuser_64.S80
1 files changed, 57 insertions, 23 deletions
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 693b14a778fa..578b625d6a3c 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -44,37 +44,55 @@ BEGIN_FTR_SECTION
44 andi. r0,r4,7 44 andi. r0,r4,7
45 bne .Lsrc_unaligned 45 bne .Lsrc_unaligned
46END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) 46END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
47 srdi r7,r5,4 47 blt cr1,.Ldo_tail /* if < 16 bytes to copy */
4820: ld r9,0(r4) 48 srdi r0,r5,5
49 addi r4,r4,-8 49 cmpdi cr1,r0,0
50 mtctr r7 5020: ld r7,0(r4)
51 andi. r5,r5,7 51220: ld r6,8(r4)
52 bf cr7*4+0,22f 52 addi r4,r4,16
53 addi r3,r3,8 53 mtctr r0
54 addi r4,r4,8 54 andi. r0,r5,0x10
55 mr r8,r9 55 beq 22f
56 blt cr1,72f 56 addi r3,r3,16
5721: ld r9,8(r4) 57 addi r4,r4,-16
5870: std r8,8(r3) 58 mr r9,r7
5922: ldu r8,16(r4) 59 mr r8,r6
6071: stdu r9,16(r3) 60 beq cr1,72f
6121: ld r7,16(r4)
62221: ld r6,24(r4)
63 addi r4,r4,32
6470: std r9,0(r3)
65270: std r8,8(r3)
6622: ld r9,0(r4)
67222: ld r8,8(r4)
6871: std r7,16(r3)
69271: std r6,24(r3)
70 addi r3,r3,32
61 bdnz 21b 71 bdnz 21b
6272: std r8,8(r3) 7272: std r9,0(r3)
73272: std r8,8(r3)
74 andi. r5,r5,0xf
63 beq+ 3f 75 beq+ 3f
64 addi r3,r3,16 76 addi r4,r4,16
65.Ldo_tail: 77.Ldo_tail:
66 bf cr7*4+1,1f 78 addi r3,r3,16
6723: lwz r9,8(r4) 79 bf cr7*4+0,246f
80244: ld r9,0(r4)
81 addi r4,r4,8
82245: std r9,0(r3)
83 addi r3,r3,8
84246: bf cr7*4+1,1f
8523: lwz r9,0(r4)
68 addi r4,r4,4 86 addi r4,r4,4
6973: stw r9,0(r3) 8773: stw r9,0(r3)
70 addi r3,r3,4 88 addi r3,r3,4
711: bf cr7*4+2,2f 891: bf cr7*4+2,2f
7244: lhz r9,8(r4) 9044: lhz r9,0(r4)
73 addi r4,r4,2 91 addi r4,r4,2
7474: sth r9,0(r3) 9274: sth r9,0(r3)
75 addi r3,r3,2 93 addi r3,r3,2
762: bf cr7*4+3,3f 942: bf cr7*4+3,3f
7745: lbz r9,8(r4) 9545: lbz r9,0(r4)
7875: stb r9,0(r3) 9675: stb r9,0(r3)
793: li r3,0 973: li r3,0
80 blr 98 blr
@@ -220,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
220131: 238131:
221 addi r3,r3,8 239 addi r3,r3,8
222120: 240120:
241320:
223122: 242122:
243322:
224124: 244124:
225125: 245125:
226126: 246126:
@@ -229,9 +249,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
229129: 249129:
230133: 250133:
231 addi r3,r3,8 251 addi r3,r3,8
232121:
233132: 252132:
234 addi r3,r3,8 253 addi r3,r3,8
254121:
255321:
256344:
235134: 257134:
236135: 258135:
237138: 259138:
@@ -303,18 +325,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
303183: 325183:
304 add r3,r3,r7 326 add r3,r3,r7
305 b 1f 327 b 1f
328371:
306180: 329180:
307 addi r3,r3,8 330 addi r3,r3,8
308171: 331171:
309177: 332177:
310 addi r3,r3,8 333 addi r3,r3,8
311170: 334370:
312172: 335372:
313176: 336176:
314178: 337178:
315 addi r3,r3,4 338 addi r3,r3,4
316185: 339185:
317 addi r3,r3,4 340 addi r3,r3,4
341170:
342172:
343345:
318173: 344173:
319174: 345174:
320175: 346175:
@@ -341,11 +367,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
341 .section __ex_table,"a" 367 .section __ex_table,"a"
342 .align 3 368 .align 3
343 .llong 20b,120b 369 .llong 20b,120b
370 .llong 220b,320b
344 .llong 21b,121b 371 .llong 21b,121b
372 .llong 221b,321b
345 .llong 70b,170b 373 .llong 70b,170b
374 .llong 270b,370b
346 .llong 22b,122b 375 .llong 22b,122b
376 .llong 222b,322b
347 .llong 71b,171b 377 .llong 71b,171b
378 .llong 271b,371b
348 .llong 72b,172b 379 .llong 72b,172b
380 .llong 272b,372b
381 .llong 244b,344b
382 .llong 245b,345b
349 .llong 23b,123b 383 .llong 23b,123b
350 .llong 73b,173b 384 .llong 73b,173b
351 .llong 44b,144b 385 .llong 44b,144b