| author | Anton Blanchard <anton@samba.org> | 2010-02-10 09:56:26 -0500 |
|---|---|---|
| committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2010-02-16 22:03:16 -0500 |
| commit | 789c299ca280f96368c0296b739e89c0bb232f8a (patch) | |
| tree | c14611126d351e6b69cb2db26afd4fbd77b3763f /arch/powerpc/lib | |
| parent | 63e6c5b8102af7df7a5e1cebbd865d711645886a (diff) | |
powerpc: Improve 64bit copy_tofrom_user
Here is a patch from Paul Mackerras that improves the ppc64 copy_tofrom_user.
The loop now does 32 bytes at a time, as well as pairing loads and stores.
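(For illustration only: a rough C sketch of the idea, not the actual assembly or
its register scheduling; the function name is made up.)

#include <stdint.h>
#include <stddef.h>

/* Illustrative sketch: copy 32 bytes per iteration, issuing the four
 * loads before the four stores so loads and stores pair up.  The real
 * routine also handles the tail, misalignment and user-access faults. */
static void copy_32_at_a_time(uint64_t *dst, const uint64_t *src, size_t bytes)
{
	size_t n = bytes / 32;

	while (n--) {
		uint64_t a = src[0];
		uint64_t b = src[1];
		uint64_t c = src[2];
		uint64_t d = src[3];

		dst[0] = a;
		dst[1] = b;
		dst[2] = c;
		dst[3] = d;

		src += 4;
		dst += 4;
	}
}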
A quick test case that reads 8kB over and over shows the improvement:
POWER6: 53% faster
POWER7: 51% faster
#define _XOPEN_SOURCE 500
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#define BUFSIZE (8 * 1024)
#define ITERATIONS 10000000
int main(void)
{
	char tmpfile[] = "/tmp/copy_to_user_testXXXXXX";
	char buf[BUFSIZE];
	unsigned long i;
	int fd;

	fd = mkstemp(tmpfile);
	if (fd < 0) {
		perror("mkstemp");
		exit(1);
	}

	if (write(fd, buf, BUFSIZE) != BUFSIZE) {
		perror("write");
		exit(1);
	}

	/* Each pread() copies BUFSIZE bytes from the page cache into the
	 * user buffer, exercising copy_tofrom_user. */
	for (i = 0; i < ITERATIONS; i++) {
		if (pread(fd, buf, BUFSIZE, 0) != BUFSIZE) {
			perror("pread");
			exit(1);
		}
	}

	close(fd);
	unlink(tmpfile);

	return 0;
}
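To put numbers on it, one option (not part of the original test) is to
wall-clock the pread() loop with clock_gettime(); a minimal sketch, with a
made-up helper name:

#include <stdio.h>
#include <time.h>

/* Hypothetical helper, not in the original test: print throughput for
 * `bytes` bytes copied between two CLOCK_MONOTONIC samples t0 and t1. */
void report_throughput(struct timespec t0, struct timespec t1,
		       unsigned long long bytes)
{
	double secs = (t1.tv_sec - t0.tv_sec) +
		      (t1.tv_nsec - t0.tv_nsec) / 1e9;

	printf("%llu bytes in %.2f s (%.1f MB/s)\n", bytes, secs,
	       bytes / secs / 1e6);
}

Sample t0 and t1 with clock_gettime(CLOCK_MONOTONIC, ...) immediately before
and after the loop and call report_throughput(t0, t1,
(unsigned long long)BUFSIZE * ITERATIONS); older glibc may need -lrt.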
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Diffstat (limited to 'arch/powerpc/lib')
-rw-r--r-- | arch/powerpc/lib/copyuser_64.S | 80 |
1 file changed, 57 insertions, 23 deletions
diff --git a/arch/powerpc/lib/copyuser_64.S b/arch/powerpc/lib/copyuser_64.S
index 693b14a778fa..578b625d6a3c 100644
--- a/arch/powerpc/lib/copyuser_64.S
+++ b/arch/powerpc/lib/copyuser_64.S
@@ -44,37 +44,55 @@ BEGIN_FTR_SECTION
 	andi. r0,r4,7
 	bne .Lsrc_unaligned
 END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
-	srdi r7,r5,4
-20:	ld r9,0(r4)
-	addi r4,r4,-8
-	mtctr r7
-	andi. r5,r5,7
-	bf cr7*4+0,22f
-	addi r3,r3,8
-	addi r4,r4,8
-	mr r8,r9
-	blt cr1,72f
-21:	ld r9,8(r4)
-70:	std r8,8(r3)
-22:	ldu r8,16(r4)
-71:	stdu r9,16(r3)
+	blt cr1,.Ldo_tail	/* if < 16 bytes to copy */
+	srdi r0,r5,5
+	cmpdi cr1,r0,0
+20:	ld r7,0(r4)
+220:	ld r6,8(r4)
+	addi r4,r4,16
+	mtctr r0
+	andi. r0,r5,0x10
+	beq 22f
+	addi r3,r3,16
+	addi r4,r4,-16
+	mr r9,r7
+	mr r8,r6
+	beq cr1,72f
+21:	ld r7,16(r4)
+221:	ld r6,24(r4)
+	addi r4,r4,32
+70:	std r9,0(r3)
+270:	std r8,8(r3)
+22:	ld r9,0(r4)
+222:	ld r8,8(r4)
+71:	std r7,16(r3)
+271:	std r6,24(r3)
+	addi r3,r3,32
 	bdnz 21b
-72:	std r8,8(r3)
+72:	std r9,0(r3)
+272:	std r8,8(r3)
+	andi. r5,r5,0xf
 	beq+ 3f
-	addi r3,r3,16
+	addi r4,r4,16
 .Ldo_tail:
-	bf cr7*4+1,1f
-23:	lwz r9,8(r4)
+	addi r3,r3,16
+	bf cr7*4+0,246f
+244:	ld r9,0(r4)
+	addi r4,r4,8
+245:	std r9,0(r3)
+	addi r3,r3,8
+246:	bf cr7*4+1,1f
+23:	lwz r9,0(r4)
 	addi r4,r4,4
 73:	stw r9,0(r3)
 	addi r3,r3,4
 1:	bf cr7*4+2,2f
-44:	lhz r9,8(r4)
+44:	lhz r9,0(r4)
 	addi r4,r4,2
 74:	sth r9,0(r3)
 	addi r3,r3,2
 2:	bf cr7*4+3,3f
-45:	lbz r9,8(r4)
+45:	lbz r9,0(r4)
 75:	stb r9,0(r3)
 3:	li r3,0
 	blr
@@ -220,7 +238,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 131:
 	addi r3,r3,8
 120:
+320:
 122:
+322:
 124:
 125:
 126:
@@ -229,9 +249,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 129:
 133:
 	addi r3,r3,8
-121:
 132:
 	addi r3,r3,8
+121:
+321:
+344:
 134:
 135:
 138:
@@ -303,18 +325,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 183:
 	add r3,r3,r7
 	b 1f
+371:
 180:
 	addi r3,r3,8
 171:
 177:
 	addi r3,r3,8
-170:
-172:
+370:
+372:
 176:
 178:
 	addi r3,r3,4
 185:
 	addi r3,r3,4
+170:
+172:
+345:
 173:
 174:
 175:
@@ -341,11 +367,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD)
 	.section __ex_table,"a"
 	.align 3
 	.llong 20b,120b
+	.llong 220b,320b
 	.llong 21b,121b
+	.llong 221b,321b
 	.llong 70b,170b
+	.llong 270b,370b
 	.llong 22b,122b
+	.llong 222b,322b
 	.llong 71b,171b
+	.llong 271b,371b
 	.llong 72b,172b
+	.llong 272b,372b
+	.llong 244b,344b
+	.llong 245b,345b
 	.llong 23b,123b
 	.llong 73b,173b
 	.llong 44b,144b