diff options
author | Anton Blanchard <anton@samba.org> | 2012-05-27 15:54:03 -0400 |
---|---|---|
committer | Benjamin Herrenschmidt <benh@kernel.crashing.org> | 2012-07-03 00:14:41 -0400 |
commit | 17968fbbd19f1bb281ee4eb2548764ac5664c4ec (patch) | |
tree | c6b7a68ea7897e6bf213bffa4795fe209609cba9 | |
parent | d136e27326a3bd50d7929a43c018abf13e426b7e (diff) |
powerpc: 64bit optimised __clear_user
I noticed __clear_user high up in a profile of one of my RAID stress
tests. The testcase was doing a dd from /dev/zero which ends up
calling __clear_user.
__clear_user is basically a loop with a single 4 byte store which
is horribly slow. We can do much better by aligning the destination
and doing 32 bytes of 8 byte stores in a loop.
The following testcase was used to verify the patch:
http://ozlabs.org/~anton/junkcode/stress_clear_user.c
To show the improvement in performance I ran a dd from /dev/zero
to /dev/null on a POWER7 box:
Before:
# dd if=/dev/zero of=/dev/null bs=1M count=10000
10485760000 bytes (10 GB) copied, 3.72379 s, 2.8 GB/s
After:
# time dd if=/dev/zero of=/dev/null bs=1M count=10000
10485760000 bytes (10 GB) copied, 0.728318 s, 14.4 GB/s
Over 5x faster.
Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
-rw-r--r-- | arch/powerpc/lib/Makefile | 2 | ||||
-rw-r--r-- | arch/powerpc/lib/string.S | 2 | ||||
-rw-r--r-- | arch/powerpc/lib/string_64.S | 141 |
3 files changed, 144 insertions, 1 deletions
diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 7735a2c2e6d9..f049e339e456 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile | |||
@@ -17,7 +17,7 @@ obj-$(CONFIG_HAS_IOMEM) += devres.o | |||
17 | obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ | 17 | obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ |
18 | memcpy_64.o usercopy_64.o mem_64.o string.o \ | 18 | memcpy_64.o usercopy_64.o mem_64.o string.o \ |
19 | checksum_wrappers_64.o hweight_64.o \ | 19 | checksum_wrappers_64.o hweight_64.o \ |
20 | copyuser_power7.o | 20 | copyuser_power7.o string_64.o |
21 | obj-$(CONFIG_XMON) += sstep.o ldstfp.o | 21 | obj-$(CONFIG_XMON) += sstep.o ldstfp.o |
22 | obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o | 22 | obj-$(CONFIG_KPROBES) += sstep.o ldstfp.o |
23 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o | 23 | obj-$(CONFIG_HAVE_HW_BREAKPOINT) += sstep.o ldstfp.o |
diff --git a/arch/powerpc/lib/string.S b/arch/powerpc/lib/string.S index 093d6316435c..1b5a0a09d609 100644 --- a/arch/powerpc/lib/string.S +++ b/arch/powerpc/lib/string.S | |||
@@ -119,6 +119,7 @@ _GLOBAL(memchr) | |||
119 | 2: li r3,0 | 119 | 2: li r3,0 |
120 | blr | 120 | blr |
121 | 121 | ||
122 | #ifdef CONFIG_PPC32 | ||
122 | _GLOBAL(__clear_user) | 123 | _GLOBAL(__clear_user) |
123 | addi r6,r3,-4 | 124 | addi r6,r3,-4 |
124 | li r3,0 | 125 | li r3,0 |
@@ -160,3 +161,4 @@ _GLOBAL(__clear_user) | |||
160 | PPC_LONG 1b,91b | 161 | PPC_LONG 1b,91b |
161 | PPC_LONG 8b,92b | 162 | PPC_LONG 8b,92b |
162 | .text | 163 | .text |
164 | #endif | ||
diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S new file mode 100644 index 000000000000..6613b9047005 --- /dev/null +++ b/arch/powerpc/lib/string_64.S | |||
@@ -0,0 +1,141 @@ | |||
1 | /* | ||
2 | * This program is free software; you can redistribute it and/or modify | ||
3 | * it under the terms of the GNU General Public License as published by | ||
4 | * the Free Software Foundation; either version 2 of the License, or | ||
5 | * (at your option) any later version. | ||
6 | * | ||
7 | * This program is distributed in the hope that it will be useful, | ||
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
10 | * GNU General Public License for more details. | ||
11 | * | ||
12 | * You should have received a copy of the GNU General Public License | ||
13 | * along with this program; if not, write to the Free Software | ||
14 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
15 | * | ||
16 | * Copyright (C) IBM Corporation, 2012 | ||
17 | * | ||
18 | * Author: Anton Blanchard <anton@au.ibm.com> | ||
19 | */ | ||
20 | |||
21 | #include <asm/ppc_asm.h> | ||
22 | |||
23 | /** | ||
24 | * __clear_user: - Zero a block of memory in user space, with less checking. | ||
25 | * @to: Destination address, in user space. | ||
26 | * @n: Number of bytes to zero. | ||
27 | * | ||
28 | * Zero a block of memory in user space. Caller must check | ||
29 | * the specified block with access_ok() before calling this function. | ||
30 | * | ||
31 | * Returns number of bytes that could not be cleared. | ||
32 | * On success, this will be zero. | ||
33 | */ | ||
34 | |||
35 | .macro err1 /* Prefix for a user-space store: registers .Ldo_err1 as the fixup for the instruction that follows */ | ||
36 | 100: /* local label = address of the instruction written after the macro */ | ||
37 | .section __ex_table,"a" /* emit the entry into the exception table section */ | ||
38 | .align 3 /* 2^3 = 8 byte alignment for the table entry */ | ||
39 | .llong 100b,.Ldo_err1 /* entry pair: (tagged instruction address, fixup address) */ | ||
40 | .previous /* switch back to the section that was active before */ | ||
41 | .endm | ||
42 | |||
43 | .macro err2 /* Prefix for a user-space store: registers .Ldo_err2 as the fixup for the instruction that follows */ | ||
44 | 200: /* local label = address of the instruction written after the macro */ | ||
45 | .section __ex_table,"a" /* emit the entry into the exception table section */ | ||
46 | .align 3 /* 2^3 = 8 byte alignment for the table entry */ | ||
47 | .llong 200b,.Ldo_err2 /* entry pair: (tagged instruction address, fixup address) */ | ||
48 | .previous /* switch back to the section that was active before */ | ||
49 | .endm | ||
50 | |||
51 | .macro err3 /* Prefix for a user-space store: registers .Ldo_err3 as the fixup for the instruction that follows */ | ||
52 | 300: /* local label = address of the instruction written after the macro */ | ||
53 | .section __ex_table,"a" /* emit the entry into the exception table section */ | ||
54 | .align 3 /* 2^3 = 8 byte alignment for the table entry */ | ||
55 | .llong 300b,.Ldo_err3 /* entry pair: (tagged instruction address, fixup address) */ | ||
56 | .previous /* switch back to the section that was active before */ | ||
57 | .endm | ||
58 | |||
59 | .Ldo_err1: /* fixup for err1 stores: r3 may have advanced, r4 has not been adjusted yet */ | ||
60 | mr r3,r8 /* rewind r3 to the pointer __clear_user saved in r8 before this run of stores */ | ||
61 | |||
62 | .Ldo_err2: /* fixup for err2 stores (and fall-through from .Ldo_err1): r3/r4 describe what is left */ | ||
63 | mtctr r4 /* CTR = bytes still to clear; every caller reaches here with r4 != 0 */ | ||
64 | 1: /* retry one byte at a time so the exact faulting byte is found */ | ||
65 | err3; stb r0,0(r3) /* r0 is 0 (set in __clear_user); a fault here goes to .Ldo_err3 */ | ||
66 | addi r3,r3,1 | ||
67 | addi r4,r4,-1 /* keep r4 equal to the number of bytes not yet cleared */ | ||
68 | bdnz 1b | ||
69 | |||
70 | .Ldo_err3: /* also reached by fall-through when the byte loop finishes with r4 == 0 */ | ||
71 | mr r3,r4 /* return value = number of bytes that could not be cleared */ | ||
72 | blr | ||
73 | |||
74 | _GLOBAL(__clear_user) /* in: r3 = to, r4 = n; out: r3 = bytes not cleared (0 on success) */ | ||
75 | cmpdi r4,32 /* compare n with 32; the result is consumed by the blt below */ | ||
76 | neg r6,r3 /* r6 = -to; its low 3 bits are the byte count needed to reach 8 byte alignment */ | ||
77 | li r0,0 /* r0 = the zero value written by every store */ | ||
78 | blt .Lshort_clear /* n < 32: skip the alignment step and the 32 byte loop */ | ||
79 | mr r8,r3 /* save the start pointer for .Ldo_err1 */ | ||
80 | mtocrf 0x01,r6 /* copy the low 4 bits of r6 into CR field 7 for the bf tests below */ | ||
81 | clrldi r6,r6,(64-3) /* r6 &= 7: number of bytes the alignment stores will write */ | ||
82 | |||
83 | /* Get the destination 8 byte aligned */ | ||
84 | bf cr7*4+3,1f /* bit for value 1 clear: no single byte needed */ | ||
85 | err1; stb r0,0(r3) | ||
86 | addi r3,r3,1 | ||
87 | |||
88 | 1: bf cr7*4+2,2f /* bit for value 2 clear: no halfword needed */ | ||
89 | err1; sth r0,0(r3) | ||
90 | addi r3,r3,2 | ||
91 | |||
92 | 2: bf cr7*4+1,3f /* bit for value 4 clear: no word needed */ | ||
93 | err1; stw r0,0(r3) | ||
94 | addi r3,r3,4 | ||
95 | |||
96 | 3: sub r4,r4,r6 /* n -= bytes consumed by the alignment stores */ | ||
97 | srdi r6,r4,5 /* r6 = n / 32 = number of 32 byte loop iterations */ | ||
98 | cmpdi r4,32 | ||
99 | blt .Lshort_clear /* fewer than 32 bytes left after aligning: go straight to the tail */ | ||
100 | mtctr r6 /* CTR = iteration count (at least 1 here) */ | ||
101 | |||
102 | /* Do 32 byte chunks */ | ||
103 | 4: | ||
104 | err2; std r0,0(r3) /* on a fault r3/r4 still describe this whole chunk, so .Ldo_err2 retries it bytewise */ | ||
105 | err2; std r0,8(r3) | ||
106 | err2; std r0,16(r3) | ||
107 | err2; std r0,24(r3) | ||
108 | addi r3,r3,32 | ||
109 | addi r4,r4,-32 /* r4 tracks the bytes remaining */ | ||
110 | bdnz 4b | ||
111 | |||
112 | .Lshort_clear: | ||
113 | /* Up to 31 bytes to go */ | ||
114 | cmpdi r4,16 | ||
115 | blt 6f /* fewer than 16 bytes: skip the 16 byte step */ | ||
116 | err2; std r0,0(r3) | ||
117 | err2; std r0,8(r3) | ||
118 | addi r3,r3,16 | ||
119 | addi r4,r4,-16 | ||
120 | |||
121 | /* Up to 15 bytes to go */ | ||
122 | 6: mr r8,r3 /* save the tail start pointer for .Ldo_err1 */ | ||
123 | clrldi r4,r4,(64-4) /* r4 &= 15: the tail byte count, also what .Ldo_err1 retries */ | ||
124 | mtocrf 0x01,r4 /* copy the 4 bit tail count into CR field 7 */ | ||
125 | bf cr7*4+0,7f /* bit for value 8 clear: no doubleword needed */ | ||
126 | err1; std r0,0(r3) | ||
127 | addi r3,r3,8 | ||
128 | |||
129 | 7: bf cr7*4+1,8f /* bit for value 4 clear: no word needed */ | ||
130 | err1; stw r0,0(r3) | ||
131 | addi r3,r3,4 | ||
132 | |||
133 | 8: bf cr7*4+2,9f /* bit for value 2 clear: no halfword needed */ | ||
134 | err1; sth r0,0(r3) | ||
135 | addi r3,r3,2 | ||
136 | |||
137 | 9: bf cr7*4+3,10f /* bit for value 1 clear: no final byte needed */ | ||
138 | err1; stb r0,0(r3) | ||
139 | |||
140 | 10: li r3,0 /* success: zero bytes left uncleared */ | ||
141 | blr | ||